MATH 422: Time Series Analysis

Fall 2022

Read data

# Machine-specific project directory; all CSVs below are read relative to it.
setwd('/Users/mandyhong/Desktop/MATH 422')

# Monthly hate-crime count series by victim group, plus covariate series.
allHC   <- read.csv('all_race_hc_ts.csv')
blackHC <- read.csv('black_hc_ts.csv')
asianHC <- read.csv('asian_hc_ts.csv')
unemp   <- read.csv('unemployment.csv')
cpi     <- read.csv('cpi.csv')

# Average temperature: drop the first 4 (header/preamble) rows, keep the
# next 360 monthly rows, then coerce the contiguous-US column to a ts.
temp <- read.csv('avg_temp.csv')
temp <- temp[-(1:4), ]
temp <- temp[seq_len(360), ]
temp <- as.numeric(temp$Contiguous.U.S.)
temp <- as.ts(temp)

non_rHC <- read.csv('non_racial_hc_ts.csv')

Modify data

# Reshape the unemployment table into long format: one row per month,
# Jan 1991 - Dec 2020 (360 observations).
# After transposing, the years sit in columns X1..X30 and the 12 monthly
# rates for each year occupy rows 2:13 of its column.
unemp <- data.frame(t(unemp))

# Year label for each monthly observation (12 per year, 30 years).
# Vectorized replacement for the original rbind-in-a-loop, which grew the
# data frame on every iteration (quadratic copying, same result).
df <- data.frame("year" = rep(1991:2020, each = 12))

# Month index 1..12 repeated across the 30 years.
df_month <- data.frame("month" = rep(1:12, 30))

# Stack the 30 year-columns end to end: rows 2:13 of X1, then X2, ...
# unlist() over the column-subset preserves the same column-major order
# the original loop produced.
df_rate <- data.frame(
  "unemployment" = unlist(unemp[2:13, paste0("X", 1:30)], use.names = FALSE)
)

unempDF <- data.frame(
  "year" = df$year,
  "month" = df_month$month,
  "unemployment" = df_rate$unemployment
)
unempDF
##     year month unemployment
## 1   1991     1          7.1
## 2   1991     2          7.3
## 3   1991     3          7.2
## 4   1991     4          6.5
## 5   1991     5          6.7
## 6   1991     6          7.0
## 7   1991     7          6.8
## 8   1991     8          6.6
## 9   1991     9          6.5
## 10  1991    10          6.5
## 11  1991    11          6.7
## 12  1991    12          6.9
## 13  1992     1          8.1
## 14  1992     2          8.2
## 15  1992     3          7.8
## 16  1992     4          7.2
## 17  1992     5          7.3
## 18  1992     6          8.0
## 19  1992     7          7.7
## 20  1992     8          7.4
## 21  1992     9          7.3
## 22  1992    10          6.9
## 23  1992    11          7.1
## 24  1992    12          7.1
## 25  1993     1          8.0
## 26  1993     2          7.8
## 27  1993     3          7.4
## 28  1993     4          6.9
## 29  1993     5          6.8
## 30  1993     6          7.2
## 31  1993     7          7.0
## 32  1993     8          6.6
## 33  1993     9          6.4
## 34  1993    10          6.4
## 35  1993    11          6.2
## 36  1993    12          6.1
## 37  1994     1          7.3
## 38  1994     2          7.1
## 39  1994     3          6.8
## 40  1994     4          6.2
## 41  1994     5          5.9
## 42  1994     6          6.2
## 43  1994     7          6.2
## 44  1994     8          5.9
## 45  1994     9          5.6
## 46  1994    10          5.4
## 47  1994    11          5.3
## 48  1994    12          5.1
## 49  1995     1          6.2
## 50  1995     2          5.9
## 51  1995     3          5.7
## 52  1995     4          5.6
## 53  1995     5          5.5
## 54  1995     6          5.8
## 55  1995     7          5.9
## 56  1995     8          5.6
## 57  1995     9          5.4
## 58  1995    10          5.2
## 59  1995    11          5.3
## 60  1995    12          5.2
## 61  1996     1          6.3
## 62  1996     2          6.0
## 63  1996     3          5.8
## 64  1996     4          5.4
## 65  1996     5          5.4
## 66  1996     6          5.5
## 67  1996     7          5.6
## 68  1996     8          5.1
## 69  1996     9          5.0
## 70  1996    10          4.9
## 71  1996    11          5.0
## 72  1996    12          5.0
## 73  1997     1          5.9
## 74  1997     2          5.7
## 75  1997     3          5.5
## 76  1997     4          4.8
## 77  1997     5          4.7
## 78  1997     6          5.2
## 79  1997     7          5.0
## 80  1997     8          4.8
## 81  1997     9          4.7
## 82  1997    10          4.4
## 83  1997    11          4.3
## 84  1997    12          4.4
## 85  1998     1          5.2
## 86  1998     2          5.0
## 87  1998     3          5.0
## 88  1998     4          4.1
## 89  1998     5          4.2
## 90  1998     6          4.7
## 91  1998     7          4.7
## 92  1998     8          4.5
## 93  1998     9          4.4
## 94  1998    10          4.2
## 95  1998    11          4.1
## 96  1998    12          4.0
## 97  1999     1          4.8
## 98  1999     2          4.7
## 99  1999     3          4.4
## 100 1999     4          4.1
## 101 1999     5          4.0
## 102 1999     6          4.5
## 103 1999     7          4.5
## 104 1999     8          4.2
## 105 1999     9          4.1
## 106 1999    10          3.8
## 107 1999    11          3.8
## 108 1999    12          3.7
## 109 2000     1          4.5
## 110 2000     2          4.4
## 111 2000     3          4.3
## 112 2000     4          3.7
## 113 2000     5          3.8
## 114 2000     6          4.1
## 115 2000     7          4.2
## 116 2000     8          4.1
## 117 2000     9          3.8
## 118 2000    10          3.6
## 119 2000    11          3.7
## 120 2000    12          3.7
## 121 2001     1          4.7
## 122 2001     2          4.6
## 123 2001     3          4.5
## 124 2001     4          4.2
## 125 2001     5          4.1
## 126 2001     6          4.7
## 127 2001     7          4.7
## 128 2001     8          4.9
## 129 2001     9          4.7
## 130 2001    10          5.0
## 131 2001    11          5.3
## 132 2001    12          5.4
## 133 2002     1          6.3
## 134 2002     2          6.1
## 135 2002     3          6.1
## 136 2002     4          5.7
## 137 2002     5          5.5
## 138 2002     6          6.0
## 139 2002     7          5.9
## 140 2002     8          5.7
## 141 2002     9          5.4
## 142 2002    10          5.3
## 143 2002    11          5.6
## 144 2002    12          5.7
## 145 2003     1          6.5
## 146 2003     2          6.4
## 147 2003     3          6.2
## 148 2003     4          5.8
## 149 2003     5          5.8
## 150 2003     6          6.5
## 151 2003     7          6.3
## 152 2003     8          6.0
## 153 2003     9          5.8
## 154 2003    10          5.6
## 155 2003    11          5.6
## 156 2003    12          5.4
## 157 2004     1          6.3
## 158 2004     2          6.0
## 159 2004     3          6.0
## 160 2004     4          5.4
## 161 2004     5          5.3
## 162 2004     6          5.8
## 163 2004     7          5.7
## 164 2004     8          5.4
## 165 2004     9          5.1
## 166 2004    10          5.1
## 167 2004    11          5.2
## 168 2004    12          5.1
## 169 2005     1          5.7
## 170 2005     2          5.8
## 171 2005     3          5.4
## 172 2005     4          4.9
## 173 2005     5          4.9
## 174 2005     6          5.2
## 175 2005     7          5.2
## 176 2005     8          4.9
## 177 2005     9          4.8
## 178 2005    10          4.6
## 179 2005    11          4.8
## 180 2005    12          4.6
## 181 2006     1          5.1
## 182 2006     2          5.1
## 183 2006     3          4.8
## 184 2006     4          4.5
## 185 2006     5          4.4
## 186 2006     6          4.8
## 187 2006     7          5.0
## 188 2006     8          4.6
## 189 2006     9          4.4
## 190 2006    10          4.1
## 191 2006    11          4.3
## 192 2006    12          4.3
## 193 2007     1          5.0
## 194 2007     2          4.9
## 195 2007     3          4.5
## 196 2007     4          4.3
## 197 2007     5          4.3
## 198 2007     6          4.7
## 199 2007     7          4.9
## 200 2007     8          4.6
## 201 2007     9          4.5
## 202 2007    10          4.4
## 203 2007    11          4.5
## 204 2007    12          4.8
## 205 2008     1          5.4
## 206 2008     2          5.2
## 207 2008     3          5.2
## 208 2008     4          4.8
## 209 2008     5          5.2
## 210 2008     6          5.7
## 211 2008     7          6.0
## 212 2008     8          6.1
## 213 2008     9          6.0
## 214 2008    10          6.1
## 215 2008    11          6.5
## 216 2008    12          7.1
## 217 2009     1          8.5
## 218 2009     2          8.9
## 219 2009     3          9.0
## 220 2009     4          8.6
## 221 2009     5          9.1
## 222 2009     6          9.7
## 223 2009     7          9.7
## 224 2009     8          9.6
## 225 2009     9          9.5
## 226 2009    10          9.5
## 227 2009    11          9.4
## 228 2009    12          9.7
## 229 2010     1         10.6
## 230 2010     2         10.4
## 231 2010     3         10.2
## 232 2010     4          9.5
## 233 2010     5          9.3
## 234 2010     6          9.6
## 235 2010     7          9.7
## 236 2010     8          9.5
## 237 2010     9          9.2
## 238 2010    10          9.0
## 239 2010    11          9.3
## 240 2010    12          9.1
## 241 2011     1          9.8
## 242 2011     2          9.5
## 243 2011     3          9.2
## 244 2011     4          8.7
## 245 2011     5          8.7
## 246 2011     6          9.3
## 247 2011     7          9.3
## 248 2011     8          9.1
## 249 2011     9          8.8
## 250 2011    10          8.5
## 251 2011    11          8.2
## 252 2011    12          8.3
## 253 2012     1          8.8
## 254 2012     2          8.7
## 255 2012     3          8.4
## 256 2012     4          7.7
## 257 2012     5          7.9
## 258 2012     6          8.4
## 259 2012     7          8.6
## 260 2012     8          8.2
## 261 2012     9          7.6
## 262 2012    10          7.5
## 263 2012    11          7.4
## 264 2012    12          7.6
## 265 2013     1          8.5
## 266 2013     2          8.1
## 267 2013     3          7.6
## 268 2013     4          7.1
## 269 2013     5          7.3
## 270 2013     6          7.8
## 271 2013     7          7.7
## 272 2013     8          7.3
## 273 2013     9          7.0
## 274 2013    10          7.0
## 275 2013    11          6.6
## 276 2013    12          6.5
## 277 2014     1          7.0
## 278 2014     2          7.0
## 279 2014     3          6.8
## 280 2014     4          5.9
## 281 2014     5          6.1
## 282 2014     6          6.3
## 283 2014     7          6.5
## 284 2014     8          6.3
## 285 2014     9          5.7
## 286 2014    10          5.5
## 287 2014    11          5.5
## 288 2014    12          5.4
## 289 2015     1          6.1
## 290 2015     2          5.8
## 291 2015     3          5.6
## 292 2015     4          5.1
## 293 2015     5          5.3
## 294 2015     6          5.5
## 295 2015     7          5.6
## 296 2015     8          5.2
## 297 2015     9          4.9
## 298 2015    10          4.8
## 299 2015    11          4.8
## 300 2015    12          4.8
## 301 2016     1          5.3
## 302 2016     2          5.2
## 303 2016     3          5.1
## 304 2016     4          4.7
## 305 2016     5          4.5
## 306 2016     6          5.1
## 307 2016     7          5.1
## 308 2016     8          5.0
## 309 2016     9          4.8
## 310 2016    10          4.7
## 311 2016    11          4.4
## 312 2016    12          4.5
## 313 2017     1          5.1
## 314 2017     2          4.9
## 315 2017     3          4.6
## 316 2017     4          4.1
## 317 2017     5          4.1
## 318 2017     6          4.5
## 319 2017     7          4.6
## 320 2017     8          4.5
## 321 2017     9          4.1
## 322 2017    10          3.9
## 323 2017    11          3.9
## 324 2017    12          3.9
## 325 2018     1          4.5
## 326 2018     2          4.4
## 327 2018     3          4.1
## 328 2018     4          3.7
## 329 2018     5          3.6
## 330 2018     6          4.2
## 331 2018     7          4.1
## 332 2018     8          3.9
## 333 2018     9          3.6
## 334 2018    10          3.5
## 335 2018    11          3.5
## 336 2018    12          3.7
## 337 2019     1          4.4
## 338 2019     2          4.1
## 339 2019     3          3.9
## 340 2019     4          3.3
## 341 2019     5          3.4
## 342 2019     6          3.8
## 343 2019     7          4.0
## 344 2019     8          3.8
## 345 2019     9          3.3
## 346 2019    10          3.3
## 347 2019    11          3.3
## 348 2019    12          3.4
## 349 2020     1          4.0
## 350 2020     2          3.8
## 351 2020     3          4.5
## 352 2020     4         14.4
## 353 2020     5         13.0
## 354 2020     6         11.2
## 355 2020     7         10.5
## 356 2020     8          8.5
## 357 2020     9          7.7
## 358 2020    10          6.6
## 359 2020    11          6.4
## 360 2020    12          6.5
# Inspect the raw CPI table: 382 monthly rows; the index value is in `Value`.
str(cpi)
## 'data.frame':    382 obs. of  5 variables:
##  $ Series.ID: chr  "CUUR0000SA0" "CUUR0000SA0" "CUUR0000SA0" "CUUR0000SA0" ...
##  $ Year     : int  1991 1991 1991 1991 1991 1991 1991 1991 1991 1991 ...
##  $ Period   : chr  "M01" "M02" "M03" "M04" ...
##  $ Label    : chr  "1991 Jan" "1991 Feb" "1991 Mar" "1991 Apr" ...
##  $ Value    : num  135 135 135 135 136 ...
# Keep only the first 360 CPI values (Jan 1991 - Dec 2020) and align them
# with the year/month index frames built above.
cpiDF=data.frame("year"=df$year,"month"=df_month$month,"cpi"=cpi["Value"][1:360,])
cpiDF
##     year month     cpi
## 1   1991     1 134.600
## 2   1991     2 134.800
## 3   1991     3 135.000
## 4   1991     4 135.200
## 5   1991     5 135.600
## 6   1991     6 136.000
## 7   1991     7 136.200
## 8   1991     8 136.600
## 9   1991     9 137.200
## 10  1991    10 137.400
## 11  1991    11 137.800
## 12  1991    12 137.900
## 13  1992     1 138.100
## 14  1992     2 138.600
## 15  1992     3 139.300
## 16  1992     4 139.500
## 17  1992     5 139.700
## 18  1992     6 140.200
## 19  1992     7 140.500
## 20  1992     8 140.900
## 21  1992     9 141.300
## 22  1992    10 141.800
## 23  1992    11 142.000
## 24  1992    12 141.900
## 25  1993     1 142.600
## 26  1993     2 143.100
## 27  1993     3 143.600
## 28  1993     4 144.000
## 29  1993     5 144.200
## 30  1993     6 144.400
## 31  1993     7 144.400
## 32  1993     8 144.800
## 33  1993     9 145.100
## 34  1993    10 145.700
## 35  1993    11 145.800
## 36  1993    12 145.800
## 37  1994     1 146.200
## 38  1994     2 146.700
## 39  1994     3 147.200
## 40  1994     4 147.400
## 41  1994     5 147.500
## 42  1994     6 148.000
## 43  1994     7 148.400
## 44  1994     8 149.000
## 45  1994     9 149.400
## 46  1994    10 149.500
## 47  1994    11 149.700
## 48  1994    12 149.700
## 49  1995     1 150.300
## 50  1995     2 150.900
## 51  1995     3 151.400
## 52  1995     4 151.900
## 53  1995     5 152.200
## 54  1995     6 152.500
## 55  1995     7 152.500
## 56  1995     8 152.900
## 57  1995     9 153.200
## 58  1995    10 153.700
## 59  1995    11 153.600
## 60  1995    12 153.500
## 61  1996     1 154.400
## 62  1996     2 154.900
## 63  1996     3 155.700
## 64  1996     4 156.300
## 65  1996     5 156.600
## 66  1996     6 156.700
## 67  1996     7 157.000
## 68  1996     8 157.300
## 69  1996     9 157.800
## 70  1996    10 158.300
## 71  1996    11 158.600
## 72  1996    12 158.600
## 73  1997     1 159.100
## 74  1997     2 159.600
## 75  1997     3 160.000
## 76  1997     4 160.200
## 77  1997     5 160.100
## 78  1997     6 160.300
## 79  1997     7 160.500
## 80  1997     8 160.800
## 81  1997     9 161.200
## 82  1997    10 161.600
## 83  1997    11 161.500
## 84  1997    12 161.300
## 85  1998     1 161.600
## 86  1998     2 161.900
## 87  1998     3 162.200
## 88  1998     4 162.500
## 89  1998     5 162.800
## 90  1998     6 163.000
## 91  1998     7 163.200
## 92  1998     8 163.400
## 93  1998     9 163.600
## 94  1998    10 164.000
## 95  1998    11 164.000
## 96  1998    12 163.900
## 97  1999     1 164.300
## 98  1999     2 164.500
## 99  1999     3 165.000
## 100 1999     4 166.200
## 101 1999     5 166.200
## 102 1999     6 166.200
## 103 1999     7 166.700
## 104 1999     8 167.100
## 105 1999     9 167.900
## 106 1999    10 168.200
## 107 1999    11 168.300
## 108 1999    12 168.300
## 109 2000     1 168.800
## 110 2000     2 169.800
## 111 2000     3 171.200
## 112 2000     4 171.300
## 113 2000     5 171.500
## 114 2000     6 172.400
## 115 2000     7 172.800
## 116 2000     8 172.800
## 117 2000     9 173.700
## 118 2000    10 174.000
## 119 2000    11 174.100
## 120 2000    12 174.000
## 121 2001     1 175.100
## 122 2001     2 175.800
## 123 2001     3 176.200
## 124 2001     4 176.900
## 125 2001     5 177.700
## 126 2001     6 178.000
## 127 2001     7 177.500
## 128 2001     8 177.500
## 129 2001     9 178.300
## 130 2001    10 177.700
## 131 2001    11 177.400
## 132 2001    12 176.700
## 133 2002     1 177.100
## 134 2002     2 177.800
## 135 2002     3 178.800
## 136 2002     4 179.800
## 137 2002     5 179.800
## 138 2002     6 179.900
## 139 2002     7 180.100
## 140 2002     8 180.700
## 141 2002     9 181.000
## 142 2002    10 181.300
## 143 2002    11 181.300
## 144 2002    12 180.900
## 145 2003     1 181.700
## 146 2003     2 183.100
## 147 2003     3 184.200
## 148 2003     4 183.800
## 149 2003     5 183.500
## 150 2003     6 183.700
## 151 2003     7 183.900
## 152 2003     8 184.600
## 153 2003     9 185.200
## 154 2003    10 185.000
## 155 2003    11 184.500
## 156 2003    12 184.300
## 157 2004     1 185.200
## 158 2004     2 186.200
## 159 2004     3 187.400
## 160 2004     4 188.000
## 161 2004     5 189.100
## 162 2004     6 189.700
## 163 2004     7 189.400
## 164 2004     8 189.500
## 165 2004     9 189.900
## 166 2004    10 190.900
## 167 2004    11 191.000
## 168 2004    12 190.300
## 169 2005     1 190.700
## 170 2005     2 191.800
## 171 2005     3 193.300
## 172 2005     4 194.600
## 173 2005     5 194.400
## 174 2005     6 194.500
## 175 2005     7 195.400
## 176 2005     8 196.400
## 177 2005     9 198.800
## 178 2005    10 199.200
## 179 2005    11 197.600
## 180 2005    12 196.800
## 181 2006     1 198.300
## 182 2006     2 198.700
## 183 2006     3 199.800
## 184 2006     4 201.500
## 185 2006     5 202.500
## 186 2006     6 202.900
## 187 2006     7 203.500
## 188 2006     8 203.900
## 189 2006     9 202.900
## 190 2006    10 201.800
## 191 2006    11 201.500
## 192 2006    12 201.800
## 193 2007     1 202.416
## 194 2007     2 203.499
## 195 2007     3 205.352
## 196 2007     4 206.686
## 197 2007     5 207.949
## 198 2007     6 208.352
## 199 2007     7 208.299
## 200 2007     8 207.917
## 201 2007     9 208.490
## 202 2007    10 208.936
## 203 2007    11 210.177
## 204 2007    12 210.036
## 205 2008     1 211.080
## 206 2008     2 211.693
## 207 2008     3 213.528
## 208 2008     4 214.823
## 209 2008     5 216.632
## 210 2008     6 218.815
## 211 2008     7 219.964
## 212 2008     8 219.086
## 213 2008     9 218.783
## 214 2008    10 216.573
## 215 2008    11 212.425
## 216 2008    12 210.228
## 217 2009     1 211.143
## 218 2009     2 212.193
## 219 2009     3 212.709
## 220 2009     4 213.240
## 221 2009     5 213.856
## 222 2009     6 215.693
## 223 2009     7 215.351
## 224 2009     8 215.834
## 225 2009     9 215.969
## 226 2009    10 216.177
## 227 2009    11 216.330
## 228 2009    12 215.949
## 229 2010     1 216.687
## 230 2010     2 216.741
## 231 2010     3 217.631
## 232 2010     4 218.009
## 233 2010     5 218.178
## 234 2010     6 217.965
## 235 2010     7 218.011
## 236 2010     8 218.312
## 237 2010     9 218.439
## 238 2010    10 218.711
## 239 2010    11 218.803
## 240 2010    12 219.179
## 241 2011     1 220.223
## 242 2011     2 221.309
## 243 2011     3 223.467
## 244 2011     4 224.906
## 245 2011     5 225.964
## 246 2011     6 225.722
## 247 2011     7 225.922
## 248 2011     8 226.545
## 249 2011     9 226.889
## 250 2011    10 226.421
## 251 2011    11 226.230
## 252 2011    12 225.672
## 253 2012     1 226.665
## 254 2012     2 227.663
## 255 2012     3 229.392
## 256 2012     4 230.085
## 257 2012     5 229.815
## 258 2012     6 229.478
## 259 2012     7 229.104
## 260 2012     8 230.379
## 261 2012     9 231.407
## 262 2012    10 231.317
## 263 2012    11 230.221
## 264 2012    12 229.601
## 265 2013     1 230.280
## 266 2013     2 232.166
## 267 2013     3 232.773
## 268 2013     4 232.531
## 269 2013     5 232.945
## 270 2013     6 233.504
## 271 2013     7 233.596
## 272 2013     8 233.877
## 273 2013     9 234.149
## 274 2013    10 233.546
## 275 2013    11 233.069
## 276 2013    12 233.049
## 277 2014     1 233.916
## 278 2014     2 234.781
## 279 2014     3 236.293
## 280 2014     4 237.072
## 281 2014     5 237.900
## 282 2014     6 238.343
## 283 2014     7 238.250
## 284 2014     8 237.852
## 285 2014     9 238.031
## 286 2014    10 237.433
## 287 2014    11 236.151
## 288 2014    12 234.812
## 289 2015     1 233.707
## 290 2015     2 234.722
## 291 2015     3 236.119
## 292 2015     4 236.599
## 293 2015     5 237.805
## 294 2015     6 238.638
## 295 2015     7 238.654
## 296 2015     8 238.316
## 297 2015     9 237.945
## 298 2015    10 237.838
## 299 2015    11 237.336
## 300 2015    12 236.525
## 301 2016     1 236.916
## 302 2016     2 237.111
## 303 2016     3 238.132
## 304 2016     4 239.261
## 305 2016     5 240.229
## 306 2016     6 241.018
## 307 2016     7 240.628
## 308 2016     8 240.849
## 309 2016     9 241.428
## 310 2016    10 241.729
## 311 2016    11 241.353
## 312 2016    12 241.432
## 313 2017     1 242.839
## 314 2017     2 243.603
## 315 2017     3 243.801
## 316 2017     4 244.524
## 317 2017     5 244.733
## 318 2017     6 244.955
## 319 2017     7 244.786
## 320 2017     8 245.519
## 321 2017     9 246.819
## 322 2017    10 246.663
## 323 2017    11 246.669
## 324 2017    12 246.524
## 325 2018     1 247.867
## 326 2018     2 248.991
## 327 2018     3 249.554
## 328 2018     4 250.546
## 329 2018     5 251.588
## 330 2018     6 251.989
## 331 2018     7 252.006
## 332 2018     8 252.146
## 333 2018     9 252.439
## 334 2018    10 252.885
## 335 2018    11 252.038
## 336 2018    12 251.233
## 337 2019     1 251.712
## 338 2019     2 252.776
## 339 2019     3 254.202
## 340 2019     4 255.548
## 341 2019     5 256.092
## 342 2019     6 256.143
## 343 2019     7 256.571
## 344 2019     8 256.558
## 345 2019     9 256.759
## 346 2019    10 257.346
## 347 2019    11 257.208
## 348 2019    12 256.974
## 349 2020     1 257.971
## 350 2020     2 258.678
## 351 2020     3 258.115
## 352 2020     4 256.389
## 353 2020     5 256.394
## 354 2020     6 257.797
## 355 2020     7 259.101
## 356 2020     8 259.918
## 357 2020     9 260.280
## 358 2020    10 260.388
## 359 2020    11 260.229
## 360 2020    12 260.474
# Non-racially-motivated hate crimes: already in long year/month format
# with 360 monthly rows, so no reshaping is needed.
str(non_rHC)
## 'data.frame':    360 obs. of  3 variables:
##  $ year      : int  1991 1991 1991 1991 1991 1991 1991 1991 1991 1991 ...
##  $ month     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ hate_crime: int  102 81 69 76 90 100 111 152 144 150 ...
non_rHC
##     year month hate_crime
## 1   1991     1        102
## 2   1991     2         81
## 3   1991     3         69
## 4   1991     4         76
## 5   1991     5         90
## 6   1991     6        100
## 7   1991     7        111
## 8   1991     8        152
## 9   1991     9        144
## 10  1991    10        150
## 11  1991    11        146
## 12  1991    12         97
## 13  1992     1        123
## 14  1992     2        149
## 15  1992     3        202
## 16  1992     4        199
## 17  1992     5        178
## 18  1992     6        155
## 19  1992     7        127
## 20  1992     8        173
## 21  1992     9        113
## 22  1992    10        179
## 23  1992    11        200
## 24  1992    12        140
## 25  1993     1        162
## 26  1993     2        176
## 27  1993     3        140
## 28  1993     4        198
## 29  1993     5        180
## 30  1993     6        196
## 31  1993     7        185
## 32  1993     8        187
## 33  1993     9        177
## 34  1993    10        206
## 35  1993    11        205
## 36  1993    12        144
## 37  1994     1        108
## 38  1994     2        153
## 39  1994     3        218
## 40  1994     4        180
## 41  1994     5        115
## 42  1994     6        140
## 43  1994     7        133
## 44  1994     8        150
## 45  1994     9        147
## 46  1994    10        163
## 47  1994    11        121
## 48  1994    12        126
## 49  1995     1        202
## 50  1995     2        162
## 51  1995     3        183
## 52  1995     4        235
## 53  1995     5        164
## 54  1995     6        204
## 55  1995     7        217
## 56  1995     8        171
## 57  1995     9        196
## 58  1995    10        216
## 59  1995    11        184
## 60  1995    12        162
## 61  1996     1        174
## 62  1996     2        185
## 63  1996     3        200
## 64  1996     4        245
## 65  1996     5        207
## 66  1996     6        202
## 67  1996     7        204
## 68  1996     8        210
## 69  1996     9        215
## 70  1996    10        234
## 71  1996    11        183
## 72  1996    12        186
## 73  1997     1        151
## 74  1997     2        198
## 75  1997     3        241
## 76  1997     4        220
## 77  1997     5        238
## 78  1997     6        225
## 79  1997     7        224
## 80  1997     8        185
## 81  1997     9        223
## 82  1997    10        247
## 83  1997    11        186
## 84  1997    12        174
## 85  1998     1        218
## 86  1998     2        206
## 87  1998     3        207
## 88  1998     4        244
## 89  1998     5        243
## 90  1998     6        210
## 91  1998     7        216
## 92  1998     8        232
## 93  1998     9        212
## 94  1998    10        312
## 95  1998    11        212
## 96  1998    12        207
## 97  1999     1        193
## 98  1999     2        233
## 99  1999     3        219
## 100 1999     4        262
## 101 1999     5        235
## 102 1999     6        201
## 103 1999     7        256
## 104 1999     8        262
## 105 1999     9        264
## 106 1999    10        227
## 107 1999    11        210
## 108 1999    12        197
## 109 2000     1        193
## 110 2000     2        216
## 111 2000     3        248
## 112 2000     4        250
## 113 2000     5        226
## 114 2000     6        235
## 115 2000     7        220
## 116 2000     8        212
## 117 2000     9        243
## 118 2000    10        398
## 119 2000    11        234
## 120 2000    12        186
## 121 2001     1        216
## 122 2001     2        195
## 123 2001     3        222
## 124 2001     4        273
## 125 2001     5        244
## 126 2001     6        266
## 127 2001     7        222
## 128 2001     8        255
## 129 2001     9        636
## 130 2001    10        366
## 131 2001    11        210
## 132 2001    12        193
## 133 2002     1        179
## 134 2002     2        186
## 135 2002     3        263
## 136 2002     4        290
## 137 2002     5        237
## 138 2002     6        248
## 139 2002     7        241
## 140 2002     8        186
## 141 2002     9        297
## 142 2002    10        230
## 143 2002    11        201
## 144 2002    12        162
## 145 2003     1        182
## 146 2003     2        194
## 147 2003     3        227
## 148 2003     4        238
## 149 2003     5        255
## 150 2003     6        218
## 151 2003     7        216
## 152 2003     8        206
## 153 2003     9        244
## 154 2003    10        258
## 155 2003    11        232
## 156 2003    12        168
## 157 2004     1        195
## 158 2004     2        229
## 159 2004     3        213
## 160 2004     4        264
## 161 2004     5        231
## 162 2004     6        236
## 163 2004     7        208
## 164 2004     8        208
## 165 2004     9        240
## 166 2004    10        232
## 167 2004    11        199
## 168 2004    12        173
## 169 2005     1        177
## 170 2005     2        189
## 171 2005     3        229
## 172 2005     4        250
## 173 2005     5        256
## 174 2005     6        202
## 175 2005     7        197
## 176 2005     8        199
## 177 2005     9        202
## 178 2005    10        230
## 179 2005    11        183
## 180 2005    12        158
## 181 2006     1        194
## 182 2006     2        187
## 183 2006     3        232
## 184 2006     4        241
## 185 2006     5        220
## 186 2006     6        247
## 187 2006     7        273
## 188 2006     8        243
## 189 2006     9        245
## 190 2006    10        241
## 191 2006    11        215
## 192 2006    12        202
## 193 2007     1        200
## 194 2007     2        162
## 195 2007     3        243
## 196 2007     4        252
## 197 2007     5        251
## 198 2007     6        219
## 199 2007     7        219
## 200 2007     8        243
## 201 2007     9        287
## 202 2007    10        290
## 203 2007    11        199
## 204 2007    12        167
## 205 2008     1        209
## 206 2008     2        229
## 207 2008     3        270
## 208 2008     4        250
## 209 2008     5        272
## 210 2008     6        262
## 211 2008     7        251
## 212 2008     8        257
## 213 2008     9        242
## 214 2008    10        277
## 215 2008    11        275
## 216 2008    12        187
## 217 2009     1        196
## 218 2009     2        188
## 219 2009     3        223
## 220 2009     4        232
## 221 2009     5        247
## 222 2009     6        220
## 223 2009     7        204
## 224 2009     8        243
## 225 2009     9        215
## 226 2009    10        250
## 227 2009    11        225
## 228 2009    12        176
## 229 2010     1        195
## 230 2010     2        179
## 231 2010     3        229
## 232 2010     4        263
## 233 2010     5        248
## 234 2010     6        202
## 235 2010     7        193
## 236 2010     8        251
## 237 2010     9        270
## 238 2010    10        263
## 239 2010    11        201
## 240 2010    12        151
## 241 2011     1        168
## 242 2011     2        148
## 243 2011     3        200
## 244 2011     4        225
## 245 2011     5        242
## 246 2011     6        235
## 247 2011     7        233
## 248 2011     8        235
## 249 2011     9        220
## 250 2011    10        256
## 251 2011    11        200
## 252 2011    12        243
## 253 2012     1        266
## 254 2012     2        201
## 255 2012     3        234
## 256 2012     4        240
## 257 2012     5        233
## 258 2012     6        239
## 259 2012     7        259
## 260 2012     8        223
## 261 2012     9        253
## 262 2012    10        210
## 263 2012    11        218
## 264 2012    12        155
## 265 2013     1        164
## 266 2013     2        158
## 267 2013     3        209
## 268 2013     4        217
## 269 2013     5        235
## 270 2013     6        246
## 271 2013     7        218
## 272 2013     8        220
## 273 2013     9        204
## 274 2013    10        223
## 275 2013    11        186
## 276 2013    12        179
## 277 2014     1        120
## 278 2014     2        151
## 279 2014     3        202
## 280 2014     4        206
## 281 2014     5        201
## 282 2014     6        227
## 283 2014     7        215
## 284 2014     8        240
## 285 2014     9        212
## 286 2014    10        240
## 287 2014    11        153
## 288 2014    12        139
## 289 2015     1        180
## 290 2015     2        153
## 291 2015     3        183
## 292 2015     4        228
## 293 2015     5        231
## 294 2015     6        240
## 295 2015     7        254
## 296 2015     8        223
## 297 2015     9        189
## 298 2015    10        214
## 299 2015    11        195
## 300 2015    12        246
## 301 2016     1        163
## 302 2016     2        170
## 303 2016     3        222
## 304 2016     4        228
## 305 2016     5        220
## 306 2016     6        255
## 307 2016     7        243
## 308 2016     8        197
## 309 2016     9        195
## 310 2016    10        228
## 311 2016    11        322
## 312 2016    12        226
## 313 2017     1        278
## 314 2017     2        267
## 315 2017     3        303
## 316 2017     4        233
## 317 2017     5        272
## 318 2017     6        259
## 319 2017     7        258
## 320 2017     8        271
## 321 2017     9        274
## 322 2017    10        256
## 323 2017    11        227
## 324 2017    12        195
## 325 2018     1        213
## 326 2018     2        245
## 327 2018     3        235
## 328 2018     4        212
## 329 2018     5        287
## 330 2018     6        274
## 331 2018     7        230
## 332 2018     8        265
## 333 2018     9        266
## 334 2018    10        315
## 335 2018    11        266
## 336 2018    12        245
## 337 2019     1        224
## 338 2019     2        187
## 339 2019     3        294
## 340 2019     4        264
## 341 2019     5        312
## 342 2019     6        329
## 343 2019     7        303
## 344 2019     8        287
## 345 2019     9        303
## 346 2019    10        314
## 347 2019    11        244
## 348 2019    12        282
## 349 2020     1        310
## 350 2020     2        295
## 351 2020     3        245
## 352 2020     4        196
## 353 2020     5        247
## 354 2020     6        301
## 355 2020     7        313
## 356 2020     8        350
## 357 2020     9        301
## 358 2020    10        352
## 359 2020    11        291
## 360 2020    12        269

0. Descriptive Analysis

### Get time series plots for all three data and summary statistics
# Each series is wrapped in an xts object indexed by yearmon (1991-01 .. 2020-12)
# so plot() renders a dated time axis.
#ts1 <- xts(unempDF$unemploymente, as.POSIXct(sprintf("%d-%d-01", unempDF$year, unempDF$month)))
ts1 <- xts(unempDF$unemployment, as.yearmon(unempDF$year + (unempDF$month-1)/12))
plot(ts1, main="unemployment")

ts2 <- xts(cpiDF$cpi, as.yearmon(cpiDF$year + (cpiDF$month-1)/12))
plot(ts2, main="cpi")

# temp has no year/month columns of its own; borrow the (identical) index
# from asianHC -- TODO confirm the two files cover the same months
ts3 <- xts(temp, as.yearmon(asianHC$year + (asianHC$month-1)/12))
plot(ts3, main="average temperature")

ts4 <- xts(allHC$hate_crime, as.yearmon(allHC$year + (allHC$month-1)/12))
plot(ts4, main="all racial hate crime")

ts5 <- xts(blackHC$hate_crime, as.yearmon(blackHC$year + (blackHC$month-1)/12))
plot(ts5, main="anti-black hate crime")

ts6 <- xts(asianHC$hate_crime, as.yearmon(asianHC$year + (asianHC$month-1)/12))
# fixed: this plots the anti-Asian series; the title previously said "anti-black"
plot(ts6, main="anti-asian hate crime")

tsplot(diff(unempDF$unemployment))

# Linear detrend of CPI: regress cpi on a simple time index 1..360 and keep
# the residuals as the detrended series.
t=1:360
fit=lm(cpi ~ t, data=cpiDF)
summary(fit)
## 
## Call:
## lm(formula = cpi ~ t, data = cpiDF)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.662 -2.094  0.005  1.262 11.118 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.326e+02  2.721e-01   487.2   <2e-16 ***
## t           3.615e-01  1.306e-03   276.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.576 on 358 degrees of freedom
## Multiple R-squared:  0.9953, Adjusted R-squared:  0.9953 
## F-statistic: 7.658e+04 on 1 and 358 DF,  p-value: < 2.2e-16
detrend_cpi=fit$residuals
tsplot(detrend_cpi)

# ACF of the detrended CPI, then of its first difference, to gauge how much
# persistence remains after removing the linear trend.
acf(detrend_cpi)

tsplot(diff(detrend_cpi))

acf(diff(detrend_cpi))

# Same diagnostics for the raw unemployment series and its first difference.
acf(unempDF$unemployment)

tsplot(diff(unempDF$unemployment))

acf(diff(unempDF$unemployment))

# Level plot + ACF of the all-race hate crime series.
tsplot(allHC$hate_crime)

acf(allHC$hate_crime)

tsplot(cpiDF$cpi)

#detrend cpi
# NOTE(review): this block repeats the CPI detrending/diagnostics already run
# above; kept as-is so the knitted output stays in sync, but it could be
# removed on the next re-knit.
t=1:360
fit=lm(cpi ~ t, data=cpiDF)
summary(fit)
## 
## Call:
## lm(formula = cpi ~ t, data = cpiDF)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.662 -2.094  0.005  1.262 11.118 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.326e+02  2.721e-01   487.2   <2e-16 ***
## t           3.615e-01  1.306e-03   276.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.576 on 358 degrees of freedom
## Multiple R-squared:  0.9953, Adjusted R-squared:  0.9953 
## F-statistic: 7.658e+04 on 1 and 358 DF,  p-value: < 2.2e-16
detrend_cpi=fit$residuals
tsplot(detrend_cpi)

acf(detrend_cpi)

tsplot(diff(detrend_cpi))

acf(diff(detrend_cpi))

# Non-racial hate crime series diagnostics.
tsplot(non_rHC$hate_crime)

acf(non_rHC$hate_crime)

# Merge all series (aligned on the common 1..360 index) into one zoo object;
# "time" is an explicit linear-trend column for later regressions.
new_merged=as.zoo(ts.intersect(cpi=ts(cpiDF$cpi),cpi_notrend=detrend_cpi, unemp=ts(unempDF$unemployment), temp, non_rhc= ts(non_rHC$hate_crime), hc=ts(allHC$hate_crime), blackhc=ts(blackHC$hate_crime), asianhc=ts(asianHC$hate_crime), time=ts(1:360)))

# Scatter plots exploring pairwise relationships between predictors and the
# hate-crime series; indices 229:360 correspond to Jan 2010 - Dec 2020.
plot(log(cpiDF$cpi), log(allHC$hate_crime))

plot(log(unempDF$unemployment), log(allHC$hate_crime))

# fixed: removed a stray trailing empty argument from the original call
plot(cpiDF$cpi[229:360], blackHC$hate_crime[229:360])

plot(blackHC$hate_crime[229:360], unempDF$unemployment[229:360])

plot(asianHC$hate_crime[229:360], cpiDF$cpi[229:360])

plot(asianHC$hate_crime[229:360], unempDF$unemployment[229:360])

# Anti-Asian hate crime vs CPI (2010 onward) with an OLS reference line.
y=asianHC$hate_crime[229:360]
x=cpiDF$cpi[229:360]
plot(x,y)
abline(lm(y ~ x), col = "orange", lwd = 3)

# Lower-panel helper for pairs(): writes the correlation of the two panel
# variables (rounded to 2 d.p.) in the centre of the cell, restoring the
# user coordinate system on exit.
panel.cor <- function(x, y, ...){
    old_usr <- par("usr")
    on.exit(par(old_usr))
    # switch to a fixed (0,1) x (0,1) coordinate system for the label
    par(usr = c(0, 1, 0, 1))
    corr_val <- round(cor(x, y), 2)
    text(0.5, 0.5, corr_val, cex = 1.75)
}
# Scatter-plot matrix of all series, correlations in the lower panels.
pairs(cbind(Hate_Crime=new_merged$hc,Black_HC=new_merged$blackhc, Asian_HC=new_merged$asianhc,CPI=new_merged$cpi,detrend_CPI=new_merged$cpi_notrend, Unemployment=new_merged$unemp,Temp=new_merged$temp, Trend=new_merged$time, non_rHC=new_merged$non_rhc),
           col="lightcoral", lower.panel=panel.cor)

# Same matrix restricted to 1991-2018 (rows 1:348). fixed: the original left
# detrend_CPI and Trend at full length (360); cbind on zoo series merges by
# index, so those two columns kept 12 extra rows while every other column was
# NA-padded. All series are now subset consistently.
pairs(cbind(Hate_Crime=new_merged$hc[1:348],Black_HC=new_merged$blackhc[1:348], Asian_HC=new_merged$asianhc[1:348],CPI=new_merged$cpi[1:348],detrend_CPI=new_merged$cpi_notrend[1:348], Unemployment=new_merged$unemp[1:348],Temp=new_merged$temp[1:348], Trend=new_merged$time[1:348]),
           col="dodgerblue3", lower.panel=panel.cor)

1. Spectral Analysis

#Function that I created in Lab7 
# Fit a harmonic regression on x_t using the n dominant periodogram
# frequencies.
#   x_t : numeric/ts/zoo series to model
#   n   : number of cosine/sine pairs to include (n >= 1)
# Side effects: mvspec() draws the periodogram; the selected frequencies are
# printed (the knitted report reads these printed values).
# Returns: the fitted lm object (x_t ~ Xcos1 + Xsin1 + ... + Xcosn + Xsinn).
testfunction<-function(x_t, n){
  myspec <- mvspec(x_t, log="no")
  # indices of the n largest spectral peaks, then their frequencies
  # (vectorized; the original grew omegaL one element at a time)
  specL=order(myspec$spec, decreasing=TRUE)[1:n]
  omegaL=myspec$freq[specL]
  t=time(x_t)
  # one column per cos/sin term, filled in below
  df <- data.frame(matrix(ncol = n*2, nrow = length(x_t)))
  colL=character(n*2)
  for (j in seq_len(n)){
    colL[2*j-1]=paste0("Xcos",j)
    colL[2*j]=paste0("Xsin",j)
    print(omegaL[j])   # keep: downstream comments rely on these printed values
    df[2*j-1] = cos(2*pi*t*omegaL[j])
    df[2*j] = sin(2*pi*t*omegaL[j])
  }
  colnames(df) <-colL
  
  #create model using new data
  mod <- lm(x_t ~ ., data=df)
  mod
}
#assuming R^2 meaning adjusted R^2?
# Incrementally add harmonic terms (via testfunction) until the model's
# adjusted R^2 exceeds r_squared; returns that lm fit.
#   x_t       : series to model
#   r_squared : adjusted-R^2 threshold to beat (Mod is NULL if it is < 0,
#               since the loop then never runs)
# fixed: the original computed mvspec(x_t) here but never used the result,
# wasting a spectral estimate and drawing a spurious periodogram.
testfunction2<-function(x_t, r_squared){
  comparison_r=0
  n=0
  Mod=NULL
  while (comparison_r <= r_squared){
    n=n+1
    Mod=testfunction(x_t,n)
    comparison_r=summary(Mod)$adj.r.squared
  }
  
  return(Mod)
}

all racial hate crime

#all racial hate crime
n = length(allHC$hate_crime)
myspec <- mvspec(allHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data

#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
allhcTS <- ts(allHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
allHC_decomp = decompose(allhcTS)
plot(allHC_decomp)

# detrended = seasonal + remainder, i.e. the series with the moving-average
# trend removed (na.omit drops the half-window of NAs at each end)
detrended = na.omit(allHC_decomp$seasonal+allHC_decomp$random)
tsplot(detrended)

hc_spec =mvspec(detrended)

mvspec(allhcTS)

# threshold 300000 picked by inspecting the periodogram; only index 30 exceeds it
which(myspec$spec > 300000) #30
## [1] 30
omega1 = myspec$freq[30] #0.08333333
# omega1 = 1/12, i.e. an annual cycle in monthly data
t = new_merged$time
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
mod_spec1 <- lm(new_merged$hc~Xcos+Xsin)
summary(mod_spec1) #Adjusted R-squared:  0.1503 
## 
## Call:
## lm(formula = new_merged$hc ~ Xcos + Xsin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -182.59  -71.94   -9.86   49.94  909.07 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  389.061      5.475  71.066  < 2e-16 ***
## Xcos         -54.528      7.742  -7.043 9.74e-12 ***
## Xsin         -30.868      7.742  -3.987 8.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.9 on 357 degrees of freedom
## Multiple R-squared:  0.155,  Adjusted R-squared:  0.1503 
## F-statistic: 32.75 on 2 and 357 DF,  p-value: 8.745e-14
AIC(mod_spec1)
## [1] 4369.714
plot(mod_spec1, which = 1)

tsplot(new_merged$hc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(ts(mod_spec1$fitted.values,start = 0),col="red")

mod_spec2=testfunction(new_merged$hc, 2) #two sets of cos, sin

## [1] 0.08333333
## [1] 0.002777778
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
summary(mod_spec2) #0.4004 
## 
## Call:
## lm(formula = x_t ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -177.88  -42.17   -7.34   28.29  839.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  389.061      4.599  84.602  < 2e-16 ***
## Xcos1        -54.528      6.504  -8.384 1.21e-15 ***
## Xsin1        -30.868      6.504  -4.746 3.01e-06 ***
## Xcos2        -13.026      6.504  -2.003    0.046 *  
## Xsin2         78.835      6.504  12.122  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 87.25 on 355 degrees of freedom
## Multiple R-squared:  0.4071, Adjusted R-squared:  0.4004 
## F-statistic: 60.94 on 4 and 355 DF,  p-value: < 2.2e-16
plot(mod_spec2, which=1)

#cycles are:
# period of the first peak: 1/omega = 12 months (annual seasonality)
firstC=1/0.08333333
firstC
## [1] 12
# period of the second peak equals the full sample length (360 months),
# i.e. it is a trend artifact, not a real cycle
secondC=1/0.002777778
secondC #is 360, meaningless
## [1] 360
#fit the model with high adj r-squared
a=testfunction2(new_merged$hc, 0.5) #6 set of Cos, Sin. Total 13 variables including intercept

## [1] 0.08333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.01388889

## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.01388889
## [1] 0.1666667
summary(a) 
## 
## Call:
## lm(formula = x_t ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -163.95  -39.98   -3.69   28.78  756.53 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  389.061      4.195  92.733  < 2e-16 ***
## Xcos1        -54.528      5.933  -9.190  < 2e-16 ***
## Xsin1        -30.868      5.933  -5.203 3.37e-07 ***
## Xcos2        -13.026      5.933  -2.195 0.028802 *  
## Xsin2         78.835      5.933  13.287  < 2e-16 ***
## Xcos3         14.096      5.933   2.376 0.018059 *  
## Xsin3        -23.784      5.933  -4.009 7.48e-05 ***
## Xcos4         -1.242      5.933  -0.209 0.834299    
## Xsin4        -26.819      5.933  -4.520 8.49e-06 ***
## Xcos5          9.110      5.933   1.535 0.125610    
## Xsin5        -20.601      5.933  -3.472 0.000582 ***
## Xcos6        -24.650      5.933  -4.155 4.11e-05 ***
## Xsin6        -14.116      5.933  -2.379 0.017893 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.6 on 347 degrees of freedom
## Multiple R-squared:  0.5177, Adjusted R-squared:  0.501 
## F-statistic: 31.03 on 12 and 347 DF,  p-value: < 2.2e-16
summary(a)$adj.r.squared #0.5009767
## [1] 0.5009767
plot(a,which=1) 

tsplot(new_merged$hc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): 'x' is NULL so the result
## will be NULL
lines(ts(a$fitted.values,start = 0),col="red")

#which(hc_spec$spec > 5000) #30, 60
#omega2 = hc_spec$freq[30] #0.08333333
#t = time(allhcTS)
#Xcos = cos(2*pi*t*omega2)
#Xsin = sin(2*pi*t*omega2)
#mod_spec3 <- lm(allhcTS~Xcos+Xsin)
#summary(mod_spec3) #Adjusted R-squared:  0.1503 
#plot(mod_spec3, which = 1)
#tsplot(allhcTS)
#lines(ts(mod_spec3$fitted.values,start = 0),col="red")
#acf(mod_spec3$residuals)

anti-Black or African American hate crime

#black hate crime
n = length(blackHC$hate_crime)
myspec2 <- mvspec(blackHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data

#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
blackhcTS <- ts(blackHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
blackHC_decomp = decompose(blackhcTS)
plot(blackHC_decomp)

# seasonal + remainder = series minus the moving-average trend
detrended2 = na.omit(blackHC_decomp$seasonal+blackHC_decomp$random)
tsplot(detrended2)

blackhc_spec =mvspec(detrended2)

mvspec(blackhcTS)

which(myspec2$spec > 80000) #30
## [1] 30
omega2 = myspec2$freq[30] #0.08333333
# again omega = 1/12: annual cycle
t = new_merged$time
Xcos = cos(2*pi*t*omega2)
Xsin = sin(2*pi*t*omega2)
# NOTE(review): this reuses the name mod_spec2, clobbering the two-harmonic
# all-race model fitted earlier in the document
mod_spec2 <- lm(new_merged$blackhc~Xcos+Xsin)
summary(mod_spec2) #Adjusted R-squared:  0.1426 
## 
## Call:
## lm(formula = new_merged$blackhc ~ Xcos + Xsin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -122.39  -37.79   -4.44   28.84  455.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  207.672      2.993  69.386  < 2e-16 ***
## Xcos         -29.715      4.233  -7.020 1.12e-11 ***
## Xsin         -14.912      4.233  -3.523 0.000482 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.79 on 357 degrees of freedom
## Multiple R-squared:  0.1474, Adjusted R-squared:  0.1426 
## F-statistic: 30.85 on 2 and 357 DF,  p-value: 4.387e-13
AIC(mod_spec2)
## [1] 3934.939
BIC(mod_spec2)
## [1] 3950.483
plot(mod_spec2, which = 1)

tsplot(new_merged$blackhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(ts(mod_spec2$fitted.values,start = 0),col="red")

mod_spec3=testfunction(new_merged$blackhc, 2) #two sets of cos, sin

## [1] 0.08333333
## [1] 0.002777778
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
summary(mod_spec3) #0.4177 
## 
## Call:
## lm(formula = x_t ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -114.59  -25.72   -3.55   16.85  472.57 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  207.672      2.467  84.194  < 2e-16 ***
## Xcos1        -29.715      3.488  -8.519 4.66e-16 ***
## Xsin1        -14.912      3.488  -4.275 2.46e-05 ***
## Xcos2        -12.446      3.488  -3.568 0.000409 ***
## Xsin2         43.834      3.488  12.566  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.8 on 355 degrees of freedom
## Multiple R-squared:  0.4241, Adjusted R-squared:  0.4177 
## F-statistic: 65.37 on 4 and 355 DF,  p-value: < 2.2e-16
plot(mod_spec3, which=1)

#cycles are:
# first peak: 12-month (annual) cycle
firstC=1/0.08333333
firstC
## [1] 12
# second peak: period = full sample length, a trend artifact
secondC=1/0.002777778
secondC #is 360, meaningless
## [1] 360
#fit the model with high adj r-squared
b=testfunction2(new_merged$blackhc, 0.5) #5 set of Cos, Sin. Total 13 variables including intercept

## [1] 0.08333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.02777778

## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.02777778
## [1] 0.03055556
summary(b) 
## 
## Call:
## lm(formula = x_t ~ ., data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -95.53 -25.06  -4.62  19.57 438.86 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 207.67222    2.27734  91.191  < 2e-16 ***
## Xcos1       -29.71521    3.22065  -9.226  < 2e-16 ***
## Xsin1       -14.91212    3.22065  -4.630 5.16e-06 ***
## Xcos2       -12.44581    3.22065  -3.864 0.000133 ***
## Xsin2        43.83440    3.22065  13.610  < 2e-16 ***
## Xcos3        -0.05592    3.22065  -0.017 0.986157    
## Xsin3       -16.43700    3.22065  -5.104 5.49e-07 ***
## Xcos4         3.97823    3.22065   1.235 0.217577    
## Xsin4       -14.91454    3.22065  -4.631 5.14e-06 ***
## Xcos5         6.68679    3.22065   2.076 0.038605 *  
## Xsin5       -12.10298    3.22065  -3.758 0.000201 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.21 on 349 degrees of freedom
## Multiple R-squared:  0.5174, Adjusted R-squared:  0.5036 
## F-statistic: 37.42 on 10 and 349 DF,  p-value: < 2.2e-16
summary(b)$adj.r.squared #0.5036
## [1] 0.5035908
plot(b,which=1) 

tsplot(new_merged$blackhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): 'x' is NULL so the result
## will be NULL
lines(ts(b$fitted.values,start = 0),col="red")

### anti-asian hate crime

n = length(asianHC$hate_crime)
myspec3 <- mvspec(asianHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data

#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
asianhcTS <- ts(asianHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
asianHC_decomp = decompose(asianhcTS)
plot(asianHC_decomp)

# seasonal + remainder = series minus the moving-average trend
detrended3 = na.omit(asianHC_decomp$seasonal+asianHC_decomp$random)
tsplot(detrended3)

asianhc_spec =mvspec(detrended3)

which(myspec3$spec > 800) #1, 4
## [1] 1 4
omega3 = myspec3$freq[1] #0.002777778 (comment corrected: 1/360, a trend artifact)
1/omega3 #360 months
## [1] 360
omega3 = myspec3$freq[4] #0.01111111 (comment corrected: 4/360)
1/omega3 #90 months
## [1] 90
t = new_merged$time
Xcos = cos(2*pi*t*omega3)
Xsin = sin(2*pi*t*omega3)
# NOTE(review): reuses the name mod_spec3, clobbering the black-HC harmonic
# model fitted above
mod_spec3 <- lm(new_merged$asianhc~Xcos+Xsin)
summary(mod_spec3) #Adjusted R-squared:  0.02852 
## 
## Call:
## lm(formula = new_merged$asianhc ~ Xcos + Xsin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.030  -6.449  -1.673   4.782  30.994 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.8444     0.4317  41.338  < 2e-16 ***
## Xcos          1.7652     0.6105   2.891  0.00407 ** 
## Xsin         -1.2481     0.6105  -2.045  0.04164 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.19 on 357 degrees of freedom
## Multiple R-squared:  0.03394,    Adjusted R-squared:  0.02852 
## F-statistic:  6.27 on 2 and 357 DF,  p-value: 0.002107
AIC(mod_spec3)
## [1] 2540.75
plot(mod_spec3, which = 1)

tsplot(new_merged$asianhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(mod_spec3$fitted.values,col="red")

#mod_spec4=testfunction(new_merged$asianhc, 1) #two sets of cos, sin
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
#summary(mod_spec4) #0.07639 
#plot(mod_spec4, which=1)
#tsplot(new_merged$asianhc)
#lines(mod_spec4$fitted.values,col="red") 

#fit the model with high adj r-squared
#c=testfunction2(new_merged$asianhc, 0.5) 
#summary(c) 
#summary(c)$adj.r.squared #0.5021
#plot(b,which=1) 
#tsplot(new_merged$asianhc)
#lines(c$fitted.values,col="red") 

2. Regression with ARIMA errors (xreg)

all racial hate crime

# Full regression of all-race hate crime on the non-hate-crime predictors
# (asianhc/blackhc excluded since they are components of the response).
mod_select<- lm(hc ~ . -asianhc-blackhc, data=new_merged)
# time comes back NA: with cpi and cpi_notrend both in the model the linear
# trend column is redundant (singular fit) -- presumably cpi is itself nearly
# linear in time, see the detrend regression above
summary(mod_select) #0.4148 , time NA ??
## 
## Call:
## lm(formula = hc ~ . - asianhc - blackhc, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -216.42  -41.80   -4.80   33.53  644.94 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 284.09825   28.06003  10.125  < 2e-16 ***
## cpi          -1.82312    0.11326 -16.096  < 2e-16 ***
## cpi_notrend  -1.57014    1.56425  -1.004  0.31618    
## unemp         5.95975    2.29718   2.594  0.00987 ** 
## temp          1.57198    0.26684   5.891 8.94e-09 ***
## non_rhc       1.58937    0.08676  18.319  < 2e-16 ***
## time               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.5 on 354 degrees of freedom
## Multiple R-squared:  0.614,  Adjusted R-squared:  0.6086 
## F-statistic: 112.6 on 5 and 354 DF,  p-value: < 2.2e-16
# Backward stepwise selection (AIC), then standard lm diagnostic plots.
mod_back=step(mod_select, direction="backward", trace=0)
par(mfrow=c(2,2))
plot(mod_back)

summary(mod_back) #unemp,temp,nonrHC, cpi. adjusted R^2=0.6086
## 
## Call:
## lm(formula = hc ~ cpi + unemp + temp + non_rhc, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -217.13  -40.44   -3.83   31.87  653.96 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 291.46929   27.08237  10.762  < 2e-16 ***
## cpi          -1.83070    0.11301 -16.199  < 2e-16 ***
## unemp         5.23780    2.18170   2.401   0.0169 *  
## temp          1.52254    0.26226   5.805 1.43e-08 ***
## non_rhc       1.59414    0.08663  18.401  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.5 on 355 degrees of freedom
## Multiple R-squared:  0.6129, Adjusted R-squared:  0.6086 
## F-statistic: 140.5 on 4 and 355 DF,  p-value: < 2.2e-16
# Multicollinearity check on the selected model: all VIFs below 5.
vif(mod_back) #all below 5
##      cpi    unemp     temp  non_rhc 
## 1.311929 1.094028 1.113487 1.509698
#1. fit best lm model 
#First, try using only non-hate crime-related variables
# na.action=NULL keeps residuals aligned with the time index for ARMA fitting
fit_hc= lm(hc~ temp + unemp + cpi + non_rhc, data=new_merged, na.action=NULL)
summary(fit_hc)       # regression results
## 
## Call:
## lm(formula = hc ~ temp + unemp + cpi + non_rhc, data = new_merged, 
##     na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -217.13  -40.44   -3.83   31.87  653.96 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 291.46929   27.08237  10.762  < 2e-16 ***
## temp          1.52254    0.26226   5.805 1.43e-08 ***
## unemp         5.23780    2.18170   2.401   0.0169 *  
## cpi          -1.83070    0.11301 -16.199  < 2e-16 ***
## non_rhc       1.59414    0.08663  18.401  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.5 on 355 degrees of freedom
## Multiple R-squared:  0.6129, Adjusted R-squared:  0.6086 
## F-statistic: 140.5 on 4 and 355 DF,  p-value: < 2.2e-16
vif(fit_hc)
##     temp    unemp      cpi  non_rhc 
## 1.113487 1.094028 1.311929 1.509698
plot(fit_hc$residuals)
lines(lowess(fit_hc$residuals)) #pretty linear. take residuals and fit ARMA

#2. check stationarity for the residuals and fit ARMA
x_t=fit_hc$residuals


tsplot(x_t) #concern with spikes and trend
adf.test(x_t) #did not pass (p = 0.0827 > 0.05)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_t
## Dickey-Fuller = -3.2323, Lag order = 7, p-value = 0.0827
## alternative hypothesis: stationary
pp.test(x_t) #pass
## Warning in pp.test(x_t): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  x_t
## Dickey-Fuller Z(alpha) = -222.16, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t) #pass
## Warning in kpss.test(x_t): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  x_t
## KPSS Level = 0.17177, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t)) #pass
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -223.08  -28.49    6.76   39.83  606.56 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.10242    0.03703  -2.766  0.00598 ** 
## yd.diff.lag1 -0.47013    0.06053  -7.767 8.95e-14 ***
## yd.diff.lag2 -0.32054    0.06378  -5.025 8.03e-07 ***
## yd.diff.lag3 -0.18405    0.06184  -2.976  0.00312 ** 
## yd.diff.lag4 -0.01820    0.05399  -0.337  0.73619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.07 on 350 degrees of freedom
## Multiple R-squared:  0.2587, Adjusted R-squared:  0.2481 
## F-statistic: 24.42 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.7657 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
# ACF/PACF of the regression residuals to choose an ARMA order for the errors.
acf(x_t) #decay to 0, might be fixed with AR(1)
pacf(x_t) #AR(1) or AR(2)

# lm residuals have mean ~0 by construction, so fit zero-mean ARMA models
mean(x_t) #close to 0
## [1] 9.341341e-16
#start with AR(1)
error_mod1=Arima(x_t, order=c(1,0,0), include.mean = FALSE)
summary(error_mod1) #AICc=4003.9   BIC=4011.64
## Series: x_t 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.4488
## s.e.  0.0471
## 
## sigma^2 = 3925:  log likelihood = -1999.94
## AIC=4003.87   AICc=4003.9   BIC=4011.64
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.06343757 62.55999 43.36681 39.93476 206.3167 0.8287035
##                    ACF1
## Training set -0.0664218
acf(error_mod1$residuals) #some lags
pacf(error_mod1$residuals) #some lags

#try AR(2)
error_mod2=Arima(x_t, order=c(2,0,0), include.mean = FALSE)
summary(error_mod2) #AICc=3998.11   BIC=4009.71
## Series: x_t 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2
##       0.3833  0.1469
## s.e.  0.0521  0.0522
## 
## sigma^2 = 3851:  log likelihood = -1996.02
## AIC=3998.05   AICc=3998.11   BIC=4009.71
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.1232347 61.88002 42.37744 48.48712 198.3662 0.8097975
##                     ACF1
## Training set -0.01776657
coeftest(error_mod2) #all sig.
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value  Pr(>|z|)    
## ar1 0.383252   0.052073  7.3599 1.841e-13 ***
## ar2 0.146884   0.052210  2.8133  0.004903 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod2$residuals) #three lags seem random
pacf(error_mod2$residuals) #three lags seem random

#try auto arima
error_mod3=auto.arima(x_t)
summary(error_mod3) #(1,0,2) AICc=3992.63   BIC=4008.06
## Series: x_t 
## ARIMA(1,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ma1      ma2
##       0.8909  -0.5405  -0.1081
## s.e.  0.0698   0.0922   0.0708
## 
## sigma^2 = 3780:  log likelihood = -1992.26
## AIC=3992.51   AICc=3992.63   BIC=4008.06
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.6995925 61.22625 41.55968 40.85384 219.5544 0.7941707
##                      ACF1
## Training set 0.0003125806
coeftest(error_mod3) #ma2 not sig.
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.890935   0.069808 12.7627 < 2.2e-16 ***
## ma1 -0.540512   0.092213 -5.8616 4.586e-09 ***
## ma2 -0.108135   0.070785 -1.5277    0.1266    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod3$residuals) #only two lags, 10 and 25 but small
pacf(error_mod3$residuals) #only two lags, 10 and 25 but small

#try 101
error_mod4=Arima(x_t, order=c(1,0,1), include.mean=FALSE)
summary(error_mod4) # AICc=3992.66   BIC=4004.25 #better than outo arima
## Series: x_t 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##         ar1      ma1
##       0.800  -0.4697
## s.e.  0.071   0.1082
## 
## sigma^2 = 3792:  log likelihood = -1993.3
## AIC=3992.6   AICc=3992.66   BIC=4004.25
## 
## Training set error measures:
##                     ME    RMSE      MAE      MPE    MAPE      MASE       ACF1
## Training set 0.3223018 61.4083 41.76911 46.61081 208.233 0.7981728 0.02286966
coeftest(error_mod4) #all sig.
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.799976   0.070979 11.2707 < 2.2e-16 ***
## ma1 -0.469693   0.108171 -4.3421 1.411e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod4$residuals) #small lags at 10,25
pacf(error_mod4$residuals) #small lags at 10,25

#add more terms
error_mod5=Arima(x_t, order=c(2,0,1), include.mean=FALSE)
summary(error_mod5) # AICc=3992.43   BIC=4007.86
## Series: x_t 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.1135  -0.1826  -0.7605
## s.e.  0.1429   0.0960   0.1271
## 
## sigma^2 = 3778:  log likelihood = -1992.16
## AIC=3992.32   AICc=3992.43   BIC=4007.86
## 
## Training set error measures:
##                     ME     RMSE     MAE      MPE     MAPE      MASE
## Training set 0.8624684 61.20797 41.4777 42.40561 222.5925 0.7926041
##                      ACF1
## Training set -0.002950645
coeftest(error_mod5) #all sig.
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value  Pr(>|z|)    
## ar1  1.113539   0.142938  7.7904 6.680e-15 ***
## ar2 -0.182571   0.096028 -1.9012   0.05727 .  
## ma1 -0.760540   0.127077 -5.9849 2.165e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod5$residuals) #small lags at 10,25
pacf(error_mod5$residuals) #small lags at 10,25


#check stationarity for the best model
tsplot(error_mod4$residuals) #looks stationary, outliers concerned
adf.test(error_mod4$residuals)
## Warning in adf.test(error_mod4$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  error_mod4$residuals
## Dickey-Fuller = -5.6754, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod4$residuals)
## Warning in pp.test(error_mod4$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  error_mod4$residuals
## Dickey-Fuller Z(alpha) = -338.03, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod4$residuals)
## Warning in kpss.test(error_mod4$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  error_mod4$residuals
## KPSS Level = 0.1002, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod4$residuals)) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -222.53  -28.89    9.26   44.36  588.33 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.13901    0.04714  -2.949   0.0034 ** 
## yd.diff.lag1 -0.65347    0.06472 -10.096  < 2e-16 ***
## yd.diff.lag2 -0.52137    0.06905  -7.550 3.81e-13 ***
## yd.diff.lag3 -0.33649    0.06556  -5.132 4.75e-07 ***
## yd.diff.lag4 -0.09464    0.05319  -1.779   0.0761 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.46 on 350 degrees of freedom
## Multiple R-squared:  0.3968, Adjusted R-squared:  0.3882 
## F-statistic: 46.05 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.9488 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model
# Regression with ARMA(1,1) errors: hate crime on temp, unemployment, CPI
# and non-racial hate crime, the error order chosen above (error_mod4).
final_mod1=Arima(new_merged$hc, order=c(1,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
summary(final_mod1) #AICc=3987.95   BIC=4018.62
## Series: new_merged$hc 
## Regression with ARIMA(1,0,1) errors 
## 
## Coefficients:
##          ar1      ma1  intercept  new_merged$temp  new_merged$unemp
##       0.9607  -0.7288   140.0553           1.7436           17.3957
## s.e.  0.0311   0.0751   135.1145           0.2318            4.7038
##       new_merged$cpi  new_merged$non_rhc
##              -1.1975              1.3340
## s.e.          0.6218              0.0858
## 
## sigma^2 = 3682:  log likelihood = -1985.77
## AIC=3987.54   AICc=3987.95   BIC=4018.62
## 
## Training set error measures:
##                    ME     RMSE      MAE    MPE     MAPE      MASE       ACF1
## Training set 0.975784 60.08839 39.84037 -1.061 10.19967 0.7771935 0.09089999
tsplot(final_mod1$residuals)

acf(final_mod1$residuals, lag=50) #looks okay
pacf(final_mod1$residuals, lag=50) #5, 10, 25 lags with small sig. 3/50
adf.test(final_mod1$residuals)
## Warning in adf.test(final_mod1$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  final_mod1$residuals
## Dickey-Fuller = -6.3053, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod1$residuals)
## Warning in pp.test(final_mod1$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  final_mod1$residuals
## Dickey-Fuller Z(alpha) = -299.47, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod1$residuals)
## Warning in kpss.test(final_mod1$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  final_mod1$residuals
## KPSS Level = 0.17817, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod1$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -168.60  -24.25    8.89   44.94  597.31 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.17339    0.05031  -3.447 0.000637 ***
## yd.diff.lag1 -0.53456    0.06578  -8.127 7.68e-15 ***
## yd.diff.lag2 -0.45316    0.06767  -6.697 8.49e-11 ***
## yd.diff.lag3 -0.27966    0.06367  -4.393 1.49e-05 ***
## yd.diff.lag4 -0.07174    0.05331  -1.346 0.179274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.89 on 350 degrees of freedom
## Multiple R-squared:  0.3519, Adjusted R-squared:  0.3426 
## F-statistic:    38 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -3.4467 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(final_mod1$residuals) #concern with outliers on the right side but pretty normal
## [1] 354 129
shapiro.test(final_mod1$residuals) #very small, 2.2e-16. Did not passed
## 
##  Shapiro-Wilk normality test
## 
## data:  final_mod1$residuals
## W = 0.80699, p-value < 2.2e-16
#get in-sample R^2 of the model
#(fitted values are already a vector in time order; the original grew a vector
# with append() inside a loop and then rev()'ed it — O(n^2) and unnecessary)
prediction=as.numeric(final_mod1$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$hc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.714976
## [1] 0.714976
#cor(fitted(final_mod1), allHC$hate_crime)^2
RMSE=sqrt(mean((cv_df$real - cv_df$predict)^2))
N_RMSE=RMSE/(max(cv_df$real)-min(cv_df$real)) #RMSE normalized by the observed range
round(RMSE, digits = 3)
## [1] 60.088
round(N_RMSE, digits = 3) # 0.053
## [1] 0.053
tsplot(new_merged$hc, lwd=2) #observed series (black) with fitted values overlaid in red below
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod1$fitted, col="red",  pch=50)

#cross-validation: refit the chosen specification on a training window,
#holding out the final 12 months for out-of-sample evaluation
a=new_merged$hc[1:348,] #training window: Jan 1991 - Dec 2019 (348 months); 2020 is held out
t=1:348 # NOTE(review): `a` and `t` are never used below — candidates for removal
sarimaMod=Arima(new_merged$hc[1:348,],order=c(1,0,1), xreg =cbind(new_merged$temp[1:348,], new_merged$unemp[1:348,], new_merged$cpi[1:348,], new_merged$non_rhc[1:348,]))
summary(sarimaMod) #AICc=3718.89   BIC=3749.28 (comment corrected to match the printed output below)
## Series: new_merged$hc[1:348, ] 
## Regression with ARIMA(1,0,1) errors 
## 
## Coefficients:
##          ar1      ma1  intercept  new_merged$temp[1:348, ]
##       0.9199  -0.7308   378.0114                    1.5067
## s.e.  0.0435   0.0700    60.1012                    0.1946
##       new_merged$unemp[1:348, ]  new_merged$cpi[1:348, ]
##                         -1.6573                  -1.7443
## s.e.                     4.3737                   0.2475
##       new_merged$non_rhc[1:348, ]
##                            1.2742
## s.e.                       0.0718
## 
## sigma^2 = 2491:  log likelihood = -1851.23
## AIC=3718.47   AICc=3718.89   BIC=3749.28
## 
## Training set error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 0.8619917 49.40817 36.08097 -0.9283992 9.558511 0.7530432
##                    ACF1
## Training set 0.03188773
#12-step-ahead forecast, holding each regressor fixed at its training-sample mean
#(the warning below is benign: the cbind columns are unnamed but are supplied
# in the same order as in training)
fcast2 <- forecast(sarimaMod, xreg=cbind(rep(mean(new_merged$temp[1:348,]), 12), rep(mean(new_merged$unemp[1:348,]), 12), rep(mean(new_merged$cpi[1:348,]),12), rep(mean(new_merged$non_rhc[1:348,]),12))) 
## Warning in forecast.forecast_ARIMA(sarimaMod, xreg =
## cbind(rep(mean(new_merged$temp[1:348, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast2)

#real data: the 12 held-out months (2020)
real_value=new_merged$hc[c(349:360)]

#compare with real data
#NOTE(review): prediction2=as.numeric(fcast2$mean) would avoid growing the
#vector in a loop, but the loop also prints each forecast (output below)
prediction2={}
for (i in 1:12){
  print(fcast2$mean[i])
  prediction2=append(fcast2$mean[i], prediction2)
}
## [1] 381.2731
## [1] 381.3204
## [1] 381.364
## [1] 381.4041
## [1] 381.4409
## [1] 381.4749
## [1] 381.5061
## [1] 381.5347
## [1] 381.5611
## [1] 381.5854
## [1] 381.6078
## [1] 381.6283
prediction2=rev(prediction2) #undo the reversed append order
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.1312307 — far weaker than the 0.71 in-sample fit
## [1] 0.1312307
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real)) #RMSE normalized by the holdout range
round(RMSE1 , digits = 3)
## [1] 284.806
round(N_RMSE1, digits = 3) #0.346 vs 0.053 in-sample: the model does not track the 2020 surge
## [1] 0.346
#make prediction: refit the chosen specification on the full sample
#(NOTE(review): this reassigns final_mod1 — presumably identical to the earlier fit; confirm)
final_mod1=Arima(new_merged$hc, order=c(1,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
xreg1_pred=forecast(final_mod1, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$unemp), 12), rep(mean(new_merged$cpi),12), rep(mean(new_merged$non_rhc),12)))
## Warning in forecast.forecast_ARIMA(final_mod1, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg1_pred)+xlab("Time")+ylab("all racial hate crimes")

xreg2_pred=forecast(final_mod1, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$unemp), 24), rep(mean(new_merged$cpi),24), rep(mean(new_merged$non_rhc),24)))
## Warning in forecast.forecast_ARIMA(final_mod1, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg2_pred)+xlab("Time")+ylab("all racial hate crimes")

anti-Black or African American hate crime

#black HC: kitchen-sink regression, excluding the other hate-crime responses
mod_select2<- lm(blackhc ~ . -asianhc-hc, data=new_merged)
summary(mod_select2) #adj R^2 = 0.4148; `time` is NA because it is perfectly
#collinear with the cpi terms ("1 not defined because of singularities" below)
## 
## Call:
## lm(formula = blackhc ~ . - asianhc - hc, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -223.32  -24.51   -3.06   18.68  453.75 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 172.17548   18.67241   9.221  < 2e-16 ***
## cpi          -0.85810    0.07537 -11.385  < 2e-16 ***
## cpi_notrend  -0.21537    1.04093  -0.207   0.8362    
## unemp         2.54736    1.52865   1.666   0.0965 .  
## temp          1.02200    0.17757   5.755 1.87e-08 ***
## non_rhc       0.62237    0.05773  10.780  < 2e-16 ***
## time               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.91 on 354 degrees of freedom
## Multiple R-squared:  0.423,  Adjusted R-squared:  0.4148 
## F-statistic:  51.9 on 5 and 354 DF,  p-value: < 2.2e-16
mod_back2=step(mod_select2, direction="backward", trace=0)
plot(mod_back2)

summary(mod_back2) #unemp,temp,nonrHC, cpi. adjusted R^2=0.4164
## 
## Call:
## lm(formula = blackhc ~ cpi + unemp + temp + non_rhc, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -223.45  -24.28   -3.28   18.83  454.99 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 173.18652   17.99733   9.623  < 2e-16 ***
## cpi          -0.85914    0.07510 -11.440  < 2e-16 ***
## unemp         2.44833    1.44983   1.689   0.0922 .  
## temp          1.01522    0.17428   5.825 1.28e-08 ***
## non_rhc       0.62303    0.05757  10.822  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.85 on 355 degrees of freedom
## Multiple R-squared:  0.4229, Adjusted R-squared:  0.4164 
## F-statistic: 65.04 on 4 and 355 DF,  p-value: < 2.2e-16
vif(mod_back2) #all below 5
##      cpi    unemp     temp  non_rhc 
## 1.311929 1.094028 1.113487 1.509698
#1. fit best lm model 
fit_hc2= lm(blackhc~ temp + unemp + cpi + non_rhc, data=new_merged, na.action=NULL)
summary(fit_hc2)       # regression results
## 
## Call:
## lm(formula = blackhc ~ temp + unemp + cpi + non_rhc, data = new_merged, 
##     na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -223.45  -24.28   -3.28   18.83  454.99 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 173.18652   17.99733   9.623  < 2e-16 ***
## temp          1.01522    0.17428   5.825 1.28e-08 ***
## unemp         2.44833    1.44983   1.689   0.0922 .  
## cpi          -0.85914    0.07510 -11.440  < 2e-16 ***
## non_rhc       0.62303    0.05757  10.822  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.85 on 355 degrees of freedom
## Multiple R-squared:  0.4229, Adjusted R-squared:  0.4164 
## F-statistic: 65.04 on 4 and 355 DF,  p-value: < 2.2e-16
vif(fit_hc2)
##     temp    unemp      cpi  non_rhc 
## 1.113487 1.094028 1.311929 1.509698
plot(fit_hc2$residuals)
lines(lowess(fit_hc2$residuals)) #pretty linear. take residuals and fit ARMA

#2. check stationarity for the residuals and fit ARMA
x_t2=fit_hc2$residuals


tsplot(x_t2) #concern with spikes and trend

adf.test(x_t2) #did not passed
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_t2
## Dickey-Fuller = -3.2257, Lag order = 7, p-value = 0.08382
## alternative hypothesis: stationary
pp.test(x_t2) #pass
## Warning in pp.test(x_t2): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  x_t2
## Dickey-Fuller Z(alpha) = -163.48, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t2) #pass
## Warning in kpss.test(x_t2): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  x_t2
## KPSS Level = 0.20784, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t2)) #pass 5pct
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -212.37  -13.31    3.49   22.39  443.79 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.05583    0.02705  -2.064  0.03972 *  
## yd.diff.lag1 -0.40120    0.05726  -7.007 1.26e-11 ***
## yd.diff.lag2 -0.30311    0.05979  -5.070 6.46e-07 ***
## yd.diff.lag3 -0.18723    0.05869  -3.190  0.00155 ** 
## yd.diff.lag4 -0.06528    0.05399  -1.209  0.22741    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.28 on 350 degrees of freedom
## Multiple R-squared:  0.191,  Adjusted R-squared:  0.1794 
## F-statistic: 16.53 on 5 and 350 DF,  p-value: 1.186e-14
## 
## 
## Value of test-statistic is: -2.0644 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(x_t2) #decay to 0, might be fixed with AR(1)

pacf(x_t2) #AR(1)

mean(x_t2) #close to 0
## [1] 4.658626e-15
#start with AR(1)
error_mod1.2=Arima(x_t2, order=c(1,0,0), include.mean = FALSE)
summary(error_mod1.2) #AICc=3649.91   BIC=3657.65
## Series: x_t2 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.5700
## s.e.  0.0434
## 
## sigma^2 = 1467:  log likelihood = -1822.94
## AIC=3649.88   AICc=3649.91   BIC=3657.65
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
## Training set 0.050652 38.25346 24.15232 72.73581 232.2421 0.8515898 -0.07009319
acf(error_mod1.2$residuals, 50) #2/50

pacf(error_mod1.2$residuals, 50) #1/50 looks good

#add an MA term: ARMA(1,1) (original comment said "start with AR(1)" — copy-paste error)
error_mod1.3=Arima(x_t2, order=c(1,0,1), include.mean = FALSE)
summary(error_mod1.3) # AICc=3643.67   BIC=3655.26 — improves on AR(1)'s 3649.91/3657.65
## Series: x_t2 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.7775  -0.3241
## s.e.  0.0655   0.1054
## 
## sigma^2 = 1438:  log likelihood = -1818.8
## AIC=3643.6   AICc=3643.67   BIC=3655.26
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE     ACF1
## Training set 0.1599784 37.81305 23.33107 67.89707 233.5927 0.8226335 0.021852
acf(error_mod1.3$residuals, 50) #good

pacf(error_mod1.3$residuals, 50) #1/50 looks good

#add terms
error_mod1.4=Arima(x_t2, order=c(2,0,1), include.mean = FALSE)
summary(error_mod1.4) #  AICc=3638.91   BIC=3654.34
## Series: x_t2 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.2920  -0.3249  -0.8277
## s.e.  0.0949   0.0782   0.0728
## 
## sigma^2 = 1414:  log likelihood = -1815.4
## AIC=3638.8   AICc=3638.91   BIC=3654.34
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.7873559 37.44777 22.74578 29.64259 289.8964 0.8019963
##                      ACF1
## Training set -0.002241382
acf(error_mod1.4$residuals, 50) #good

pacf(error_mod1.4$residuals, 50) #good

#add terms
error_mod1.5=Arima(x_t2, order=c(2,0,2), include.mean = FALSE)
summary(error_mod1.5) #AICc=3640.95   BIC=3660.21 got worse
## Series: x_t2 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1      ma2
##       1.2645  -0.2989  -0.7980  -0.0212
## s.e.  0.2511   0.2300   0.2567   0.1666
## 
## sigma^2 = 1418:  log likelihood = -1815.39
## AIC=3640.78   AICc=3640.95   BIC=3660.21
## 
## Training set error measures:
##                   ME     RMSE      MAE      MPE     MAPE      MASE         ACF1
## Training set 0.79204 37.44663 22.74146 28.39091 291.2595 0.8018441 -0.004337836
acf(error_mod1.5$residuals, 50) #good

pacf(error_mod1.5$residuals, 50) #good

#auto arima
error_mod1.6=auto.arima(x_t2)
summary(error_mod1.6) #2,0,1
## Series: x_t2 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.2920  -0.3249  -0.8277
## s.e.  0.0949   0.0782   0.0728
## 
## sigma^2 = 1414:  log likelihood = -1815.4
## AIC=3638.8   AICc=3638.91   BIC=3654.34
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 0.7873559 37.44777 22.74578 29.64259 289.8964 0.8019963
##                      ACF1
## Training set -0.002241382
acf(error_mod1.6$residuals, 50) #good

pacf(error_mod1.6$residuals, 50) #good

#check stationarity for the best model
tsplot(error_mod1.4$residuals) #looks stationary, outliers concerned

adf.test(error_mod1.4$residuals)
## Warning in adf.test(error_mod1.4$residuals): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  error_mod1.4$residuals
## Dickey-Fuller = -6.2597, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod1.4$residuals)
## Warning in pp.test(error_mod1.4$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  error_mod1.4$residuals
## Dickey-Fuller Z(alpha) = -355.99, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod1.4$residuals)
## Warning in kpss.test(error_mod1.4$residuals): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  error_mod1.4$residuals
## KPSS Level = 0.072607, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod1.4$residuals)) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -202.38  -14.25    4.47   22.49  439.70 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.08316    0.03702  -2.246   0.0253 *  
## yd.diff.lag1 -0.73245    0.06045 -12.117  < 2e-16 ***
## yd.diff.lag2 -0.55559    0.06821  -8.145 6.75e-15 ***
## yd.diff.lag3 -0.35583    0.06605  -5.387 1.31e-07 ***
## yd.diff.lag4 -0.13673    0.05296  -2.582   0.0102 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.23 on 350 degrees of freedom
## Multiple R-squared:  0.4052, Adjusted R-squared:  0.3967 
## F-statistic: 47.68 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.2462 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model: anti-Black hate crimes regressed on temp/unemp/cpi/non_rhc
#with ARMA(2,0,1) errors (order chosen from the residual analysis above)
final_mod2=Arima(new_merged$blackhc, order=c(2,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
summary(final_mod2) #AICc=3602.76   BIC=3637.22
## Series: new_merged$blackhc 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2     ma1  intercept  new_merged$temp  new_merged$unemp
##       1.3523  -0.3618  -0.853    -4.1477           1.3253            5.4139
## s.e.  0.0753   0.0723   0.043   148.5399           0.1772            2.7158
##       new_merged$cpi  new_merged$non_rhc
##               0.2386              0.2815
## s.e.          0.7318              0.0511
## 
## sigma^2 = 1257:  log likelihood = -1792.12
## AIC=3602.24   AICc=3602.76   BIC=3637.22
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.5938505 35.06441 21.45948 -1.549571 10.29595 0.7554376
##                    ACF1
## Training set 0.02318245
acf(final_mod2$residuals, lag=50) #looks okay

pacf(final_mod2$residuals, lag=50) #looks good

adf.test(final_mod2$residuals)
## Warning in adf.test(final_mod2$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  final_mod2$residuals
## Dickey-Fuller = -5.7921, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod2$residuals)
## Warning in pp.test(final_mod2$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  final_mod2$residuals
## Dickey-Fuller Z(alpha) = -336.08, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod2$residuals)
## Warning in kpss.test(final_mod2$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  final_mod2$residuals
## KPSS Level = 0.33501, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod2$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79.46 -12.34   5.41  24.89 448.71 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.27119    0.06651  -4.077 5.64e-05 ***
## yd.diff.lag1 -0.55479    0.07429  -7.467 6.57e-13 ***
## yd.diff.lag2 -0.46727    0.07367  -6.343 6.98e-10 ***
## yd.diff.lag3 -0.32171    0.06707  -4.797 2.39e-06 ***
## yd.diff.lag4 -0.12645    0.05321  -2.376    0.018 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.89 on 350 degrees of freedom
## Multiple R-squared:  0.4132, Adjusted R-squared:  0.4049 
## F-statistic:  49.3 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -4.0775 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(final_mod2$residuals) #concern with outliers on the right side but pretty normal

## [1] 354  61
shapiro.test(final_mod2$residuals) #very small, 2.2e-16. Did not passed
## 
##  Shapiro-Wilk normality test
## 
## data:  final_mod2$residuals
## W = 0.7041, p-value < 2.2e-16
#get in-sample R^2 of the model
#(fitted values are already a vector in time order; the original grew a vector
# with append() inside a loop and then rev()'ed it — O(n^2) and unnecessary)
prediction=as.numeric(final_mod2$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$blackhc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.6724792
## [1] 0.6724792
RMSE=sqrt(mean((cv_df$real - cv_df$predict)^2))
N_RMSE=RMSE/(max(cv_df$real)-min(cv_df$real)) #RMSE normalized by the observed range
round(RMSE, digits = 3)
## [1] 35.064
round(N_RMSE, digits = 3) # 0.058
## [1] 0.058
tsplot(new_merged$blackhc, lwd=2) #observed series (black) with fitted values overlaid in red below
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod2$fitted, col="red",  pch=50)

#cross-validation: refit on a training window, holding out the final 12 months
a=new_merged$blackhc[1:336,] #training window: Jan 1991 - Dec 2018 (336 months); 2019 is held out
t=1:336 # NOTE(review): `a` and `t` are never used below — candidates for removal
sarimaMod2=Arima(new_merged$blackhc[1:336,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:336,], new_merged$unemp[1:336,], new_merged$cpi[1:336,], new_merged$non_rhc[1:336,]))
summary(sarimaMod2) #AICc=3151.33   BIC=3185.14 (comment corrected to match the printed output below)
## Series: new_merged$blackhc[1:336, ] 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  new_merged$temp[1:336, ]
##       1.1975  -0.2064  -0.7680     3.8370                    1.1595
## s.e.  0.0817   0.0782   0.0534   225.6145                    0.1206
##       new_merged$unemp[1:336, ]  new_merged$cpi[1:336, ]
##                          2.1096                   0.3094
## s.e.                     3.4498                   1.0811
##       new_merged$non_rhc[1:336, ]
##                            0.2459
## s.e.                       0.0379
## 
## sigma^2 = 667.8:  log likelihood = -1566.39
## AIC=3150.78   AICc=3151.33   BIC=3185.14
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.3559803 25.53286 19.80543 -1.169533 9.838228 0.7524177
##                     ACF1
## Training set 0.001820025
fcast3 <- forecast(sarimaMod2, xreg=cbind(rep(mean(new_merged$temp[1:336,]), 12), rep(mean(new_merged$unemp[1:336,]), 12), rep(mean(new_merged$cpi[1:336,]),12), rep(mean(new_merged$non_rhc[1:336,]),12))) 
## Warning in forecast.forecast_ARIMA(sarimaMod2, xreg =
## cbind(rep(mean(new_merged$temp[1:336, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast3)

#real data
real_value=new_merged$blackhc[c(337:348)]

#compare with real data
prediction2={}
for (i in 1:12){
  print(fcast3$mean[i])
  prediction2=append(fcast3$mean[i], prediction2)
}
## [1] 152.6256
## [1] 149.4486
## [1] 149.1586
## [1] 149.467
## [1] 149.8962
## [1] 150.3464
## [1] 150.797
## [1] 151.2436
## [1] 151.6855
## [1] 152.1224
## [1] 152.5544
## [1] 152.9815
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.01080143
## [1] 0.01080143
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 36.944
round(N_RMSE1, digits = 3) #0.528
## [1] 0.528
#make prediction
xreg3_pred=forecast(final_mod2, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$unemp), 12), rep(mean(new_merged$cpi),12), rep(mean(new_merged$non_rhc),12)))
## Warning in forecast.forecast_ARIMA(final_mod2, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg3_pred)+xlab("Time")+ylab("anti-Black or African American hate crimes")

xreg3_pred=forecast(final_mod2, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$unemp), 24), rep(mean(new_merged$cpi),24), rep(mean(new_merged$non_rhc),24)))
## Warning in forecast.forecast_ARIMA(final_mod2, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg3_pred)+xlab("Time")+ylab("anti-Black or African American hate crimes")

anti-Asian hate crime

#asian HC
mod_select3<- lm(asianhc ~ . -blackhc-hc-cpi_notrend, data=new_merged)
summary(mod_select3) #0.382 , 
## 
## Call:
## lm(formula = asianhc ~ . - blackhc - hc - cpi_notrend, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.013  -4.073  -1.121   3.102  40.401 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95.277000  18.702777   5.094 5.71e-07 ***
## cpi         -0.628370   0.144624  -4.345 1.82e-05 ***
## unemp        0.232443   0.212856   1.092 0.275567    
## temp         0.035546   0.024726   1.438 0.151425    
## non_rhc      0.055613   0.008039   6.918 2.16e-11 ***
## time         0.174372   0.052402   3.328 0.000968 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.533 on 354 degrees of freedom
## Multiple R-squared:  0.3906, Adjusted R-squared:  0.382 
## F-statistic: 45.38 on 5 and 354 DF,  p-value: < 2.2e-16
mod_back3=step(mod_select3, direction="backward", trace=0)
plot(mod_back3)

summary(mod_back3) #time,temp,nonrHC, cpi. adjusted R^2=0.3817
## 
## Call:
## lm(formula = asianhc ~ cpi + temp + non_rhc + time, data = new_merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.805  -3.990  -1.220   3.146  40.384 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 90.198881  18.120360   4.978 1.01e-06 ***
## cpi         -0.577043   0.136810  -4.218 3.13e-05 ***
## temp         0.036042   0.024728   1.458  0.14585    
## non_rhc      0.053362   0.007773   6.865 2.97e-11 ***
## time         0.156456   0.049780   3.143  0.00181 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.534 on 355 degrees of freedom
## Multiple R-squared:  0.3885, Adjusted R-squared:  0.3817 
## F-statistic: 56.39 on 4 and 355 DF,  p-value: < 2.2e-16
vif(mod_back3) # NOTE(review): "all below 5" does NOT hold here — cpi (~224) and
#time (~226) are severely collinear, since cpi carries the linear trend
##        cpi       temp    non_rhc       time 
## 223.807553   1.152376   1.414682 225.650992
#1. fit best lm model 
# BUG FIX: this is the anti-Asian section, but the original line regressed
# `blackhc` (copy-paste from the previous section). The response must be
# `asianhc`, consistent with mod_select3/mod_back3 above.
# NOTE(review): all knitted output below this point (summary, vif, residual
# diagnostics, error models) was produced from the erroneous blackhc fit and
# must be regenerated after this fix.
fit_hc3= lm(asianhc~ temp + cpi + non_rhc +time, data=new_merged, na.action=NULL)
summary(fit_hc3)       # regression results
## 
## Call:
## lm(formula = blackhc ~ temp + cpi + non_rhc + time, data = new_merged, 
##     na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -215.33  -25.31   -3.21   16.91  469.47 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 145.07366  130.42298   1.112    0.267    
## temp          1.02744    0.17798   5.773  1.7e-08 ***
## cpi          -0.51098    0.98470  -0.519    0.604    
## non_rhc       0.59771    0.05594  10.684  < 2e-16 ***
## time         -0.11849    0.35830  -0.331    0.741    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.03 on 355 degrees of freedom
## Multiple R-squared:  0.4184, Adjusted R-squared:  0.4119 
## F-statistic: 63.86 on 4 and 355 DF,  p-value: < 2.2e-16
vif(fit_hc3)
##       temp        cpi    non_rhc       time 
##   1.152376 223.807553   1.414682 225.650992
plot(fit_hc3$residuals)
lines(lowess(fit_hc3$residuals)) #pretty linear. take residuals and fit ARMA

#2. check stationarity for the residuals and fit ARMA
x_t3=fit_hc3$residuals
# NOTE(review): as knitted, fit_hc3 regressed blackhc rather than asianhc, so
# x_t3 and the entire anti-Asian error-model analysis below were computed from
# the wrong residuals — re-run after correcting the response variable.


tsplot(x_t3) #concern with spikes and possible trend in the residual series

adf.test(x_t3) #did not passed
## 
##  Augmented Dickey-Fuller Test
## 
## data:  x_t3
## Dickey-Fuller = -3.2932, Lag order = 7, p-value = 0.07241
## alternative hypothesis: stationary
pp.test(x_t3) #pass
## Warning in pp.test(x_t3): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  x_t3
## Dickey-Fuller Z(alpha) = -157.93, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t3) #pass
## Warning in kpss.test(x_t3): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  x_t3
## KPSS Level = 0.18798, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t3)) #pass 5pct
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -201.84  -13.41    3.73   21.00  446.75 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.05853    0.02722  -2.150  0.03223 *  
## yd.diff.lag1 -0.38172    0.05725  -6.667 1.02e-10 ***
## yd.diff.lag2 -0.27221    0.05952  -4.573 6.68e-06 ***
## yd.diff.lag3 -0.17572    0.05848  -3.005  0.00285 ** 
## yd.diff.lag4 -0.05995    0.05406  -1.109  0.26823    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.16 on 350 degrees of freedom
## Multiple R-squared:  0.1787, Adjusted R-squared:  0.1669 
## F-statistic: 15.23 on 5 and 350 DF,  p-value: 1.519e-13
## 
## 
## Value of test-statistic is: -2.1501 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(x_t3) #decay to 0, might be fixed with AR(1)

pacf(x_t3) #AR(1)

mean(x_t3) #close to 0
## [1] -3.795264e-15
#start with AR(1)
error_mod2.1=Arima(x_t3, order=c(1,0,0), include.mean = FALSE)
summary(error_mod2.1) #AICc=3645.18   BIC=3652.92
## Series: x_t3 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.5821
## s.e.  0.0429
## 
## sigma^2 = 1448:  log likelihood = -1820.57
## AIC=3645.15   AICc=3645.18   BIC=3652.92
## 
## Training set error measures:
##                      ME     RMSE     MAE      MPE     MAPE      MASE
## Training set 0.05611259 38.00186 23.8664 65.26358 174.9703 0.8518535
##                     ACF1
## Training set -0.07271268
acf(error_mod2.1$residuals, 50) #looks good

pacf(error_mod2.1$residuals, 50) #looks good

#add terms
error_mod2.2=Arima(x_t3, order=c(1,0,1), include.mean = FALSE)
summary(error_mod2.2) # ICc=3639.66   BIC=3651.25
## Series: x_t3 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.7643  -0.2869
## s.e.  0.0632   0.0989
## 
## sigma^2 = 1422:  log likelihood = -1816.79
## AIC=3639.59   AICc=3639.66   BIC=3651.25
## 
## Training set error measures:
##                    ME    RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set 0.149849 37.6022 23.11698 68.77424 172.2578 0.8251048 0.01373493
acf(error_mod2.2$residuals, 50) #good

pacf(error_mod2.2$residuals, 50) #1/50 looks good

#add terms
error_mod2.3=Arima(x_t3, order=c(2,0,1), include.mean = FALSE)
summary(error_mod2.3) #  AICc=3637.11   BIC=3652.54
## Series: x_t3 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.3057  -0.3431  -0.8179
## s.e.  0.1101   0.0874   0.0893
## 
## sigma^2 = 1407:  log likelihood = -1814.5
## AIC=3637   AICc=3637.11   BIC=3652.54
## 
## Training set error measures:
##                     ME    RMSE      MAE     MPE     MAPE      MASE         ACF1
## Training set 0.7075085 37.3557 22.57958 68.9171 180.8811 0.8059235 -0.007267818
acf(error_mod2.3$residuals, 50) #good

pacf(error_mod2.3$residuals, 50) #good

#add terms
error_mod2.4=Arima(x_t3, order=c(2,0,2), include.mean = FALSE)
summary(error_mod2.4) #AICc=3639.11   BIC=3658.37 got worse
## Series: x_t3 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2
##       1.3556  -0.3892  -0.8715  0.0352
## s.e.  0.2395   0.2160   0.2475  0.1540
## 
## sigma^2 = 1411:  log likelihood = -1814.47
## AIC=3638.94   AICc=3639.11   BIC=3658.37
## 
## Training set error measures:
##                     ME    RMSE      MAE      MPE     MAPE     MASE        ACF1
## Training set 0.7070318 37.3532 22.58601 69.18505 181.2759 0.806153 -0.00365553
acf(error_mod2.4$residuals, 50) #good

pacf(error_mod2.4$residuals, 50) #good

#auto arima
error_mod2.5=auto.arima(x_t3)
summary(error_mod2.5) #2,0,1
## Series: x_t3 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       1.3057  -0.3431  -0.8179
## s.e.  0.1101   0.0874   0.0893
## 
## sigma^2 = 1407:  log likelihood = -1814.5
## AIC=3637   AICc=3637.11   BIC=3652.54
## 
## Training set error measures:
##                     ME    RMSE      MAE     MPE     MAPE      MASE         ACF1
## Training set 0.7075085 37.3557 22.57958 68.9171 180.8811 0.8059235 -0.007267818
acf(error_mod2.5$residuals, 50) #good

pacf(error_mod2.5$residuals, 50) #good

#check stationarity for the best model
tsplot(error_mod2.3$residuals) #looks stationary, outliers concerned

adf.test(error_mod2.3$residuals)
## Warning in adf.test(error_mod2.3$residuals): p-value smaller than printed p-
## value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  error_mod2.3$residuals
## Dickey-Fuller = -6.2479, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod2.3$residuals)
## Warning in pp.test(error_mod2.3$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  error_mod2.3$residuals
## Dickey-Fuller Z(alpha) = -361.13, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod2.3$residuals)
## Warning in kpss.test(error_mod2.3$residuals): p-value greater than printed p-
## value
## 
##  KPSS Test for Level Stationarity
## 
## data:  error_mod2.3$residuals
## KPSS Level = 0.067833, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod2.3$residuals)) #pass all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -191.78  -15.07    4.57   21.83  439.03 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.08515    0.03745  -2.274   0.0236 *  
## yd.diff.lag1 -0.73747    0.06060 -12.170  < 2e-16 ***
## yd.diff.lag2 -0.54616    0.06848  -7.975 2.19e-14 ***
## yd.diff.lag3 -0.35418    0.06631  -5.341 1.67e-07 ***
## yd.diff.lag4 -0.13696    0.05295  -2.586   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.09 on 350 degrees of freedom
## Multiple R-squared:  0.4092, Adjusted R-squared:  0.4008 
## F-statistic: 48.48 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.2735 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model
final_mod3=Arima(new_merged$asianhc, order=c(2,0,1), xreg =cbind(new_merged$temp, new_merged$cpi, new_merged$non_rhc, new_merged$time))
summary(final_mod3) #AICc=2277.47   BIC=2311.93
## Series: new_merged$asianhc 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  new_merged$temp  new_merged$cpi
##       1.1594  -0.1970  -0.8161    59.2817           0.0474         -0.3362
## s.e.  0.1131   0.0859   0.0939    40.0799           0.0259          0.3028
##       new_merged$non_rhc  new_merged$time
##                   0.0367           0.0817
## s.e.              0.0081           0.1065
## 
## sigma^2 = 31.75:  log likelihood = -1129.48
## AIC=2276.95   AICc=2277.47   BIC=2311.93
## 
## Training set error measures:
##                      ME     RMSE     MAE       MPE     MAPE      MASE
## Training set 0.05253393 5.571598 4.13377 -10.44372 27.63248 0.7991509
##                      ACF1
## Training set -0.003537299
acf(final_mod3$residuals, lag=50) #looks okay

pacf(final_mod3$residuals, lag=50) #looks good

adf.test(final_mod3$residuals)
## Warning in adf.test(final_mod3$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  final_mod3$residuals
## Dickey-Fuller = -6.8737, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod3$residuals)
## Warning in pp.test(final_mod3$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  final_mod3$residuals
## Dickey-Fuller Z(alpha) = -366.9, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod3$residuals)
## Warning in kpss.test(final_mod3$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  final_mod3$residuals
## KPSS Level = 0.15042, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod3$residuals)) #passed all
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.989  -2.942   0.560   4.069  37.646 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.10442    0.04064  -2.569   0.0106 *  
## yd.diff.lag1 -0.73078    0.06167 -11.850  < 2e-16 ***
## yd.diff.lag2 -0.54121    0.06941  -7.797 7.32e-14 ***
## yd.diff.lag3 -0.31351    0.06675  -4.697 3.80e-06 ***
## yd.diff.lag4 -0.11770    0.05278  -2.230   0.0264 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.08 on 350 degrees of freedom
## Multiple R-squared:  0.4164, Adjusted R-squared:  0.4081 
## F-statistic: 49.94 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -2.5691 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
# Residual normality diagnostics for the final xreg model.
qqPlot(final_mod3$residuals) #concern with outliers on the right side but pretty normal; qqPlot flags obs 351 and 352 (early 2020) as the extremes

## [1] 351 352
shapiro.test(final_mod3$residuals) #p-value = 1.397e-11 (see output below) < 0.05, so residual normality is rejected; the "2.2e-16" previously noted here was stale
## 
##  Shapiro-Wilk normality test
## 
## data:  final_mod3$residuals
## W = 0.93361, p-value = 1.397e-11
#get (pseudo) R^2 of the model: squared correlation of fitted vs. observed.
#The original prepend-then-rev loop simply rebuilt the fitted values in their
#original order (and hard-coded the length 360); extract them directly
#instead of growing a vector inside a loop.
prediction=as.numeric(final_mod3$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$asianhc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.5492432
## [1] 0.5492432
# Root-mean-square error of the fitted values, plus RMSE normalised by the
# observed range of the series.
squared_err <- (cv_df$real - cv_df$predict)^2
RMSE <- sqrt(mean(squared_err))
N_RMSE <- RMSE / diff(range(cv_df$real))
round(RMSE, digits = 3)
## [1] 5.572
round(N_RMSE, digits = 3) # 0.116
## [1] 0.116
tsplot(new_merged$asianhc, lwd=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod3$fitted, col="red",  pch=50)

#cross-validation: refit the xreg model on a training window only
a=new_merged$asianhc[1:336,] #obs 1-336 = Jan 1991 - Dec 2018 (336/12 = 28 years), not "through 2019" as originally noted; obs 337-348 (2019) are scored below
t=1:336
sarimaMod3=Arima(new_merged$asianhc[1:336,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:336,], new_merged$cpi[1:336,], new_merged$non_rhc[1:336,], new_merged$time[1:336,]))
summary(sarimaMod3) # AICc=2055.28   BIC=2089.09
## Series: new_merged$asianhc[1:336, ] 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  new_merged$temp[1:336, ]
##       0.9966  -0.0548  -0.8024    36.1527                    0.0303
## s.e.  0.1002   0.0731   0.0821    30.2853                    0.0214
##       new_merged$cpi[1:336, ]  new_merged$non_rhc[1:336, ]
##                       -0.1537                       0.0465
## s.e.                   0.2306                       0.0073
##       new_merged$time[1:336, ]
##                        -0.0022
## s.e.                    0.0827
## 
## sigma^2 = 25.71:  log likelihood = -1018.37
## AIC=2054.73   AICc=2055.28   BIC=2089.09
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.06622725 5.009553 3.920922 -9.415356 26.85912 0.7596929
##                      ACF1
## Training set -0.002104686
fcast4 <- forecast(sarimaMod3, xreg=cbind(rep(mean(new_merged$temp[1:336,]), 12), rep(mean(new_merged$cpi[1:336,]), 12), rep(mean(new_merged$non_rhc[1:336,]),12), rep(mean(new_merged$time[1:336,]),12))) 
## Warning in forecast.forecast_ARIMA(sarimaMod3, xreg =
## cbind(rep(mean(new_merged$temp[1:336, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast4)

#real data
real_value=new_merged$asianhc[c(337:348)]

#compare with real data
#NOTE(review): this prepend-then-reverse loop is equivalent to
#as.numeric(fcast4$mean); kept as-is because the cached values below were
#produced by the print() call inside the loop.
prediction2={}
for (i in 1:12){
  print(fcast4$mean[i])
  prediction2=append(fcast4$mean[i], prediction2)
}
## [1] 20.289
## [1] 19.80259
## [1] 19.64663
## [1] 19.51783
## [1] 19.39801
## [1] 19.28565
## [1] 19.18023
## [1] 19.08132
## [1] 18.98852
## [1] 18.90145
## [1] 18.81975
## [1] 18.74311
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.3731521
## [1] 0.3731521
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 4.737
round(N_RMSE1, digits = 3) #0.526
## [1] 0.526
#make prediction
xreg4_pred=forecast(final_mod3, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$cpi), 12), rep(mean(new_merged$non_rhc),12), rep(mean(new_merged$time),12)))
## Warning in forecast.forecast_ARIMA(final_mod3, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg4_pred)+xlab("Time")+ylab("anti-Asian hate crimes")

xreg4_pred=forecast(final_mod3, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$cpi), 24), rep(mean(new_merged$non_rhc),24), rep(mean(new_merged$time),24)))
## Warning in forecast.forecast_ARIMA(final_mod3, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg4_pred)+xlab("Time")+ylab("anti-Asian hate crimes")

3. GARCH

hcR=diff(log(allHC$hate_crime))
tsplot(hcR, col="lightcoral")

blackhcR=diff(log(blackHC$hate_crime))
tsplot(blackhcR, col="lightcoral")

#GARCH for asian HC: fit an MA(1) to the log-returns, then test the squared
#residuals for ARCH effects to decide whether a GARCH model is warranted.
asianhcR=diff(log(asianHC$hate_crime))
tsplot(asianhcR,col="lightcoral")

acf(asianhcR) #lag 1 sig, MA(1)

pacf(asianhcR) #lag 12 sig. several lags at front

ahcMod1=Arima(asianhcR, order=c(0,0,1))
acf(ahcMod1$residuals , lag=38) #looks good, almost no correlation

pacf(ahcMod1$residuals, lag=38) #lag 12, 22, 24 sig. Try SARIMA? Try the simple way first and see if GARCH fixes it

tsplot(ahcMod1$residuals) #slight heteroskedasticity

res1 =ahcMod1$residuals
tsplot(res1^2) #squared residuals proxy the conditional variance

acf(res1^2, lag=50) #lag 32 sig. 1/50, almost no autocorrelation. CAN'T USE GARCH!

pacf(res1^2, lag=50) #lag 31 sig. 1/50

auto.arima(res1^2) #suggests ARIMA(0,0,1)
## Series: res1^2 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1    mean
##       0.0815  0.1102
## s.e.  0.0521  0.0110
## 
## sigma^2 = 0.03738:  log likelihood = 81.53
## AIC=-157.06   AICc=-156.99   BIC=-145.41
#try garch(1,0) 
#ahcFit1=garchFit(~arma(0,1) + garch(1,0), data = asianhcR)
#summary(ahcFit1) #0.6664991 0.7097673
#plot(ahcFit1,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers

#try garch(0,1)
#ahcFit2=garchFit(~arma(0,1) + garch(0,1), data = asianhcR) #Error, does not allow me to try garch(0,1)
#summary(ahcFit2) #0.6664991 0.7097673
#plot(ahcFit2,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers

#try garch(1,1)
#ahcFit2=garchFit(~arma(0,1) + garch(1,1), data = asianhcR) #(the "Error ... garch(0,1)" note that used to be here was a stale copy-paste from the previous attempt)
#summary(ahcFit2) #0.6720701 0.7261554 got worse
#plot(ahcFit2,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers

#what if I fit SARIMA? Can I do this?

#plot(ahcFit1, which=3)   

#ahcFit1$

4. ARFIMA

tsplot(allHC$hate_crime)

acf(allHC$hate_crime, 50, main="all racial hate crime")

tsplot(blackHC$hate_crime)

acf(blackHC$hate_crime,50, main="anti-Black or African American hate crime")

tsplot(asianHC$hate_crime)

acf(asianHC$hate_crime,50, main="anti-Asian hate crime")

library(arfima)
## Loading required package: ltsa
## Note that the arfima package has new defaults starting with
## 1.4-0: type arfimachanges() for a list, as well as some other notes.
## NOTE: some of these are quite important!
## 
## Attaching package: 'arfima'
## The following object is masked from 'package:forecast':
## 
##     arfima
## The following object is masked from 'package:stats':
## 
##     BIC
summary(arfimaMod1 <- arfima(allHC$hate_crime, order = c(0,0,0))) #d=0.4757504
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = allHC$hate_crime, order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##                Estimate  Std. Error Th. Std. Err. z-value Pr(>|z|)    
## d.f           0.4757504   0.0275354     0.0410974 17.2778  < 2e-16 ***
## Fitted mean 389.0534119 189.9210362            NA  2.0485 0.040511 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 7050.6; Log-likelihood = -1595.65; AIC = 3197.3; BIC = 3208.96
## 
## Numerical Correlations of Coefficients:
##             d.f  Fitted mean
## d.f         1.00 0.02       
## Fitted mean 0.02 1.00       
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
summary(arfimaMod2 <- arfima(blackHC$hate_crime, order = c(0,0,0))) #d=0.4911473
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = blackHC$hate_crime, order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##                Estimate  Std. Error Th. Std. Err. z-value Pr(>|z|)    
## d.f           0.4911473   0.0118870     0.0410974 41.3181  < 2e-16 ***
## Fitted mean 207.6410160 166.7932852            NA  1.2449  0.21317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 1702.71; Log-likelihood = -1340.42; AIC = 2686.85; BIC = 2698.5
## 
## Numerical Correlations of Coefficients:
##             d.f  Fitted mean
## d.f         1.00 0.03       
## Fitted mean 0.03 1.00       
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
summary(arfimaMod3 <- arfima(asianHC$hate_crime, order = c(0,0,0))) #d=0.3988466
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = asianHC$hate_crime, order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##               Estimate Std. Error Th. Std. Err.  z-value   Pr(>|z|)    
## d.f          0.3988466  0.0352278     0.0410974 11.32192 < 2.22e-16 ***
## Fitted mean 18.1603855  4.4192760            NA  4.10936 3.9676e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 34.2928; Log-likelihood = -636.143; AIC = 1278.29; BIC = 1289.95
## 
## Numerical Correlations of Coefficients:
##             d.f  Fitted mean
## d.f         1.00 0.00       
## Fitted mean 0.00 1.00       
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
r = resid(arfimaMod1)[[1]]
acf(r, 50) # seasonal component remaining

r2 = resid(arfimaMod2)[[1]]
acf(r2, 50) # seasonal component remaining

#Seems like asianHC could use ARFIMA
r3 = resid(arfimaMod3)[[1]]
tsplot(r3)

acf(r3, 50) # looks good but really small lags in 12 period

pacf(r3, 50) #looks good but 12, 23 concerning but small

adf.test(r3)
## Warning in adf.test(r3): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  r3
## Dickey-Fuller = -6.5764, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(r3) #fail
## 
##  KPSS Test for Level Stationarity
## 
## data:  r3
## KPSS Level = 0.53845, Truncation lag parameter = 5, p-value = 0.03301
pp.test(r3)
## Warning in pp.test(r3): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  r3
## Dickey-Fuller Z(alpha) = -369.42, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
summary(ur.ers(r3))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.344  -2.930   0.855   4.632  37.265 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -0.19313    0.05577  -3.463   0.0006 ***
## yd.diff.lag1 -0.67702    0.06840  -9.899  < 2e-16 ***
## yd.diff.lag2 -0.52171    0.07279  -7.167 4.57e-12 ***
## yd.diff.lag3 -0.30353    0.06808  -4.459 1.11e-05 ***
## yd.diff.lag4 -0.12892    0.05273  -2.445   0.0150 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.337 on 350 degrees of freedom
## Multiple R-squared:  0.4369, Adjusted R-squared:  0.4288 
## F-statistic:  54.3 on 5 and 350 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -3.4631 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(r3)

## [1] 351 352
#compare with ARIMA way
daHC=diff(asianHC$hate_crime)
acf(daHC) #MA(1)

pacf(daHC) #a lot of lags, tailing off like an MA(2)?

auto.arima(daHC) #(2,0,2) too complicated, overdifferenced!
## Series: daHC 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2
##       0.5687  -0.0288  -1.2067  0.2743
## s.e.  0.5477   0.1608   0.5446  0.4839
## 
## sigma^2 = 34.17:  log likelihood = -1141.78
## AIC=2293.57   AICc=2293.74   BIC=2312.99
#prediction on 2021 and 2022
fcast_arfima=predict(arfimaMod3, n.ahead=24)
fcast_arfima
## $`Mode 1`
## $`Mode 1`$`Forecasts and SDs`
##                    1        2        3        4        5        6        7
## Forecasts   18.17083 18.59330 18.71884 18.73874 18.71289 18.66593 18.60949
## Exact SD     5.85730  6.30696  6.51608  6.64662  6.73950  6.81066  6.86786
## Limiting SD  5.85601  6.30461  6.51282  6.64252  6.73462  6.80504  6.86153
##                    8        9       10       11       12       13       14
## Forecasts   18.54950 18.48913 18.43007 18.37323 18.31908 18.26780 18.21943
## Exact SD     6.91540  6.95588  6.99102  7.02197  7.04957  7.07444  7.09702
## Limiting SD  6.90838  6.94821  6.98270  7.01303  7.04003  7.06430  7.08631
##                   15       16       17       18       19       20       21
## Forecasts   18.17392 18.13115 18.09101 18.05335 18.01802 17.98487 17.95376
## Exact SD     7.11769  7.13672  7.15433  7.17071  7.18601  7.20035  7.21384
## Limiting SD  7.10640  7.12487  7.14194  7.15778  7.17255  7.18637  7.19935
##                   22       23       24
## Forecasts   17.92456 17.89714 17.87139
## Exact SD     7.22657  7.23862  7.25005
## Limiting SD  7.21158  7.22312  7.23406
final_predict=data.frame("fit"=c( 18.17083, 18.59330, 18.71884, 18.73874 ,18.71289 ,18.66593 ,18.60949, 18.54950, 18.48913, 18.43007, 18.37323 ,18.31908, 18.26780 ,18.21943 ,18.17392, 18.13115, 18.09101 ,18.05335 ,18.01802, 17.98487, 17.95376 ,17.92456, 17.89714, 17.87139), "sd"=c( 5.85730,  6.30696,  6.51608 , 6.64662 , 6.73950 , 6.81066  ,6.86786 , 6.91540,  6.95588  ,6.99102 , 7.02197  ,7.04957 , 7.07444  ,7.09702  ,7.11769 , 7.13672,   7.15433 , 7.17071  ,7.18601  ,7.20035 , 7.21384,  7.22657 , 7.23862  ,7.25005))

fitted=ts(final_predict$fit, start=c(2021,1), frequency=12)

#95% normal forecast interval: fit +/- 1.96 * sd, computed vectorised
#(no element-wise loop needed; also adapts automatically to the horizon
#length instead of hard-coding 24).
fit_df=data.frame("lwr"=final_predict$fit - 1.96*final_predict$sd,
                  "upr"=final_predict$fit + 1.96*final_predict$sd)


lower_list=ts(fit_df$lwr,start=c(2021,1),frequency =12)
upper_list=ts(fit_df$upr,start=c(2021,1),frequency =12)
#NA_list=ts(rep(NA, 12),start=c(2017,7),frequency =12)
#merged_ts <- ts(c(salmon, NA_list),               
#   start = start(salmon),
#   frequency = frequency(salmon))
asianHC_ts=ts(asianHC$hate_crime, start=c(1991,1), frequency=12)
NA_list=ts(rep(NA, 24),start=c(2021,1),frequency =12)
merged_ts <- ts(c(asianHC_ts, NA_list),               
   start = start(asianHC_ts),
   frequency = frequency(asianHC_ts))

#Plot the observed series (NA-padded through 2022) and overlay the ARFIMA
#point forecast (red) with its 95% interval (blue).
tsplot(merged_ts) #the original xlim=c() was a no-op and is dropped
#'fitted' is already a ts starting Jan 2021, so lines() places it correctly.
#The original passed start=/end= to lines(), which are not graphical
#parameters (see the warnings that used to appear here) and were ignored.
lines(fitted, col="red")
abline(h=0, col=8) # horizontal reference line at y = 0 (drawn once; the second identical abline was redundant)
lines(lower_list, col="blue")
lines(upper_list, col="blue")

#ARFIMA on seasonal diff. data??
acf(diff(allHC$hate_crime, lag=12))

acf(diff(blackHC$hate_crime, lag=12))

acf(diff(asianHC$hate_crime, lag=12))

summary(arfimaMod1.2 <- arfima(diff(allHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.3878510
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = diff(allHC$hate_crime, lag = 12), order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##               Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)    
## d.f          0.3878510  0.0445982     0.0417971 8.69656  < 2e-16 ***
## Fitted mean  9.1924449 64.1638528            NA 0.14327  0.88608    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 8517.79; Log-likelihood = -1574.47; AIC = 3154.94; BIC = 3166.5
## 
## Numerical Correlations of Coefficients:
##             d.f   Fitted mean
## d.f          1.00 -0.20      
## Fitted mean -0.20  1.00      
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
summary(arfimaMod2.2 <- arfima(diff(blackHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.4749521
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = diff(blackHC$hate_crime, lag = 12), order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##               Estimate Std. Error Th. Std. Err.  z-value Pr(>|z|)    
## d.f          0.4749521  0.0284903     0.0417971 16.67063  < 2e-16 ***
## Fitted mean  5.8687671 91.1756546            NA  0.06437  0.94868    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 1617.35; Log-likelihood = -1286.28; AIC = 2578.56; BIC = 2590.11
## 
## Numerical Correlations of Coefficients:
##             d.f   Fitted mean
## d.f          1.00 -0.21      
## Fitted mean -0.21  1.00      
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
summary(arfimaMod3.2 <- arfima(diff(asianHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.2416897
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
## 
## Call:
##  
## arfima(z = diff(asianHC$hate_crime, lag = 12), order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##              Estimate Std. Error Th. Std. Err. z-value  Pr(>|z|)    
## d.f         0.2416897  0.0435546     0.0417971 5.54912 2.871e-08 ***
## Fitted mean 0.4908735  1.6364596            NA 0.29996   0.76421    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 53.5331; Log-likelihood = -691.821; AIC = 1389.64; BIC = 1401.2
## 
## Numerical Correlations of Coefficients:
##             d.f  Fitted mean
## d.f         1.00 0.04       
## Fitted mean 0.04 1.00       
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.64
r.2 = resid(arfimaMod1.2)[[1]]
acf(r.2, 50) #huge lag at 12. 

pacf(r.2, 50) #12, 24

r2.2 = resid(arfimaMod2.2)[[1]]
acf(r2.2, 50) #huge lag at 12 

pacf(r2.2, 50) #12, 1

r3.2 = resid(arfimaMod3.2)[[1]]
acf(r3.2, 50) #huge lag at 12

pacf(r3.2,50) #some seasonal lags

#trial cases: can I fit SARIMA to residual?
trail_allHC <- arfima(diff(allHC$hate_crime, lag=12), order = c(1,0,0))
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
trail_allHC
## Number of modes: 1 
## 
## Call:
## arfima(z = diff(allHC$hate_crime, lag = 12), order = c(1, 0, 0))
## 
## Coefficients for fits:
##              Coef.1:    SE.1:     
## phi(1)        0.139969   0.117196 
## d.f           0.291948   0.0967529
## Fitted mean   9.22225    34.9748  
## logl         -1573.55             
## sigma^2       8514.31             
## Starred fits are close to invertibility/stationarity boundaries
tsplot(resid(trail_allHC)[[1]])

acf(resid(trail_allHC)[[1]]) #still, lag 12 concerned

#compare
compare1=auto.arima(diff(asianHC$hate_crime, lag=12)) #(1,1)
acf(compare1$residuals) #still, lag 12

pacf(compare1$residuals) #still, lag 12

#trial: ARFIMA(1,d,0) on the seasonally differenced anti-Black series
trail_blackHC <- arfima(diff(blackHC$hate_crime, lag=12), order = c(1,0,0))
#BUG FIX: the original code printed and plotted trail_allHC here (copy-paste
#from the all-races block above), so the anti-Black fit was never actually
#inspected; the cached output that followed belonged to trail_allHC.
#Re-knit to regenerate the correct output for trail_blackHC.
trail_blackHC
tsplot(resid(trail_blackHC)[[1]])

acf(resid(trail_blackHC)[[1]]) #check whether lag 12 is still a concern

trail_asianHC <- arfima(diff(asianHC$hate_crime, lag=12), order = c(1,0,0))
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
trail_asianHC
## Number of modes: 1 
## 
## Call:
## arfima(z = diff(asianHC$hate_crime, lag = 12), order = c(1, 0, 0))
## 
## Coefficients for fits:
##              Coef.1:      SE.1:     
## phi(1)        0.00604986   0.0973575
## d.f           0.237606     0.078903 
## Fitted mean   0.482695     1.60599  
## logl         -691.819               
## sigma^2       53.69                 
## Starred fits are close to invertibility/stationarity boundaries
tsplot(resid(trail_asianHC)[[1]])

acf(resid(trail_asianHC)[[1]]) #still, lag 12 concerned

autoMod = auto.arima(diff(asianHC$hate_crime, lag=12))
summary(autoMod)
## Series: diff(asianHC$hate_crime, lag = 12) 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.7439  -0.5025
## s.e.  0.0860   0.1099
## 
## sigma^2 = 53.04:  log likelihood = -1183.82
## AIC=2373.64   AICc=2373.71   BIC=2385.2
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.07539184 7.261722 5.429604 NaN  Inf 0.7702667 -0.006083782
acf(autoMod$residuals)

pacf(autoMod$residuals)

#In general, ARFIMA on the seasonally differenced data still leaves a significant autocorrelation at lag 12. This could be addressed with SARIMA, but the arfima() interface used here does not expose seasonal orders.
#Ideas for further work:
#  - CCF between non-racial and racial hate crimes?
#  - SARIMA for non-racial hate crimes?
#  - Bootstrap forecast intervals for the SARIMA model.

5. Cross-validation

all racial hate crime: SARIMA and xreg

#CV, SARIMA and Xreg
#best SARIMA model
mod9HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
#summary(mod9HC) #AIC=4017.1   AICc=4017.27   BIC=4036.36

#compare forecasting performance of two models
a=allHC[1:312,] #including only through 2016
t=1:312
sarimaMod_allHC=Arima(a$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE) 
#summary(sarimaMod) #AIC=3780.98   AICc=3781.16   BIC=3800.06
sarimaPred=forecast(sarimaMod_allHC,h=48)$mean

#real data
real_value=allHC$hate_crime[c(313:360)]

#compare with real data
cor(real_value,sarimaPred)^2 # R^2 = 0.1606327
## [1] 0.1606327
RMSE1=sqrt(mean((real_value - sarimaPred)^2))
N_RMSE1=RMSE1/(max(real_value)-min(real_value))
round(RMSE1 , digits = 3)
## [1] 149.781
round(N_RMSE1, digits = 3) #0.172
## [1] 0.172
#cross-validation: refit the xreg model on the training window only
a=new_merged$hc[1:312,] #obs 1-312 = Jan 1991 - Dec 2016 (312/12 = 26 years), not "through 2018" as originally noted
sarimaMod=Arima(new_merged$hc[1:312,],order=c(1,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$unemp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,]))
summary(sarimaMod) #AIC=3343.49  AICc=3343.96  BIC=3373.43 per the output below; the 3592.9/3622.99 previously noted here belonged to a different fit
## Series: new_merged$hc[1:312, ] 
## Regression with ARIMA(1,0,1) errors 
## 
## Coefficients:
##         ar1      ma1  intercept  new_merged$temp[1:312, ]
##       0.928  -0.7376   354.5704                    1.4847
## s.e.  0.042   0.0674    68.9848                    0.2074
##       new_merged$unemp[1:312, ]  new_merged$cpi[1:312, ]
##                          0.4911                  -1.7716
## s.e.                     5.1580                   0.3061
##       new_merged$non_rhc[1:312, ]
##                            1.3545
## s.e.                       0.0751
## 
## sigma^2 = 2562:  log likelihood = -1663.74
## AIC=3343.49   AICc=3343.96   BIC=3373.43
## 
## Training set error measures:
##                     ME    RMSE      MAE        MPE     MAPE      MASE
## Training set 0.9564581 50.0412 36.81672 -0.8859321 9.639288 0.7367609
##                    ACF1
## Training set 0.02037902
fcast2 <- forecast(sarimaMod, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$unemp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]),48), rep(mean(new_merged$non_rhc[1:312,]),48))) 
## Warning in forecast.forecast_ARIMA(sarimaMod, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast2)

#real data
real_value=new_merged$hc[c(313:360)]

#compare with real data
prediction2={}
for (i in 1:48){
  print(fcast2$mean[i])
  prediction2=append(fcast2$mean[i], prediction2)
}
## [1] 377.3941
## [1] 377.9822
## [1] 378.528
## [1] 379.0344
## [1] 379.5044
## [1] 379.9406
## [1] 380.3453
## [1] 380.721
## [1] 381.0695
## [1] 381.393
## [1] 381.6932
## [1] 381.9717
## [1] 382.2303
## [1] 382.4702
## [1] 382.6928
## [1] 382.8994
## [1] 383.0911
## [1] 383.269
## [1] 383.4341
## [1] 383.5873
## [1] 383.7295
## [1] 383.8615
## [1] 383.9839
## [1] 384.0975
## [1] 384.203
## [1] 384.3009
## [1] 384.3917
## [1] 384.4759
## [1] 384.5541
## [1] 384.6267
## [1] 384.6941
## [1] 384.7566
## [1] 384.8146
## [1] 384.8684
## [1] 384.9183
## [1] 384.9647
## [1] 385.0077
## [1] 385.0476
## [1] 385.0847
## [1] 385.119
## [1] 385.1509
## [1] 385.1805
## [1] 385.208
## [1] 385.2335
## [1] 385.2572
## [1] 385.2791
## [1] 385.2995
## [1] 385.3184
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 =  0.1112606
## [1] 0.1112606
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 149.678
round(N_RMSE1, digits = 3) #0.172
## [1] 0.172

anti-Black or African American hate crime: SARIMA and xreg

#CV
#compare forecasting performance of two models
b=blackHC[1:312,] #obs 1-312 = Jan 1991 - Dec 2016 (312/12 = 26 years), not "through 2019" as originally noted
t=1:312

# Best SARIMA for the anti-Black series, refit on the training window.
sarimaModB=Arima(b$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE) 
#summary(sarimaModB) #AIC=3072.28   AICc=3072.4   BIC=3087.53
sarimaBPred=forecast(sarimaModB,h=48)$mean

#real data
real_value2=blackHC$hate_crime[c(313:360)]

#compare with real data
cor(real_value2,sarimaBPred)^2 # R^2 = 0.1172818
## [1] 0.1172818
#cross-validation
a=new_merged$blackhc[1:312,] #including only through 2019
t=1:312
sarimaMod2=Arima(new_merged$blackhc[1:312,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$unemp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,]))
summary(sarimaMod2) #AICc=3718.89   BIC=3749.28
## Series: new_merged$blackhc[1:312, ] 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  new_merged$temp[1:312, ]
##       1.1880  -0.1953  -0.7596   -63.0393                    1.1456
## s.e.  0.0836   0.0813   0.0549   267.0067                    0.1282
##       new_merged$unemp[1:312, ]  new_merged$cpi[1:312, ]
##                          3.1408                   0.6119
## s.e.                     3.6133                   1.2973
##       new_merged$non_rhc[1:312, ]
##                            0.2551
## s.e.                       0.0393
## 
## sigma^2 = 693.8:  log likelihood = -1460.35
## AIC=2938.71   AICc=2939.3   BIC=2972.39
## 
## Training set error measures:
##                      ME     RMSE     MAE       MPE     MAPE     MASE
## Training set 0.09864195 26.00008 20.1361 -1.235001 9.849989 0.745604
##                     ACF1
## Training set 0.002420022
fcast3 <- forecast(sarimaMod2, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$unemp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]),48), rep(mean(new_merged$non_rhc[1:312,]),48))) 
## Warning in forecast.forecast_ARIMA(sarimaMod2, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast3)

#real data
real_value=new_merged$blackhc[c(313:360)]

#compare with real data
prediction2={}
for (i in 1:48){
  print(fcast3$mean[i])
  prediction2=append(fcast3$mean[i], prediction2)
}
## [1] 125.9071
## [1] 126.8751
## [1] 127.4958
## [1] 128.0442
## [1] 128.5745
## [1] 129.0974
## [1] 129.615
## [1] 130.1279
## [1] 130.6362
## [1] 131.1398
## [1] 131.6389
## [1] 132.1335
## [1] 132.6237
## [1] 133.1094
## [1] 133.5907
## [1] 134.0677
## [1] 134.5403
## [1] 135.0087
## [1] 135.4729
## [1] 135.9329
## [1] 136.3887
## [1] 136.8405
## [1] 137.2881
## [1] 137.7317
## [1] 138.1713
## [1] 138.6069
## [1] 139.0386
## [1] 139.4664
## [1] 139.8904
## [1] 140.3105
## [1] 140.7268
## [1] 141.1394
## [1] 141.5482
## [1] 141.9533
## [1] 142.3548
## [1] 142.7527
## [1] 143.147
## [1] 143.5377
## [1] 143.9249
## [1] 144.3086
## [1] 144.6888
## [1] 145.0656
## [1] 145.439
## [1] 145.809
## [1] 146.1757
## [1] 146.5391
## [1] 146.8992
## [1] 147.256
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.2034128
## [1] 0.2034128
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 115.105
round(N_RMSE1, digits = 3) 
## [1] 0.199

Anti-Asian hate crime: SARIMA, xreg, and ARFIMA

#CV
#best SARIMA model for the diff data: 
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
#summary(mod10A) #AIC=2285.7   AICc=2285.87   BIC=2305.12 

#compare forecasting performance of two models
# NOTE(review): `c` shadows base::c() here; function-call lookup still finds
# base::c for c(313:360) below, but a different name would be safer.
c=asianHC[1:312,] #training window: first 312 monthly observations
t=1:312
sarimaModA=Arima(c$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE) 
#summary(sarimaModA) #AIC=2148.5   AICc=2148.67   BIC=2167.74
sarimaAPred=forecast(sarimaModA,h=48)$mean

#real data: the 48-month holdout (observations 313-360)
real_value3=asianHC$hate_crime[c(313:360)]

#compare with real data
cor(real_value3,sarimaAPred)^2 # R^2 =[1] 0.03355379, might be due to surge in 2020?
## [1] 0.03355379
#cross-validation: regression with ARMA(2,0,1) errors and external regressors
a=new_merged$asianhc[1:312,] #training window: first 312 monthly observations
t=1:312
sarimaMod3=Arima(new_merged$asianhc[1:312,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,], new_merged$time[1:312,]))
summary(sarimaMod3) #AIC=1915.66   AICc=1916.26   BIC=1949.35
## Series: new_merged$asianhc[1:312, ] 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  new_merged$temp[1:312, ]
##       1.0259  -0.0828  -0.8119    33.0759                    0.0306
## s.e.  0.1030   0.0757   0.0832    30.4239                    0.0229
##       new_merged$cpi[1:312, ]  new_merged$non_rhc[1:312, ]
##                       -0.1300                       0.0487
## s.e.                   0.2322                       0.0075
##       new_merged$time[1:312, ]
##                        -0.0152
## s.e.                    0.0841
## 
## sigma^2 = 26.29:  log likelihood = -948.83
## AIC=1915.66   AICc=1916.26   BIC=1949.35
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE     MASE
## Training set 0.07171987 5.061236 3.963485 -9.067421 26.34634 0.763247
##                      ACF1
## Training set -0.002219476
# Forecast 48 months ahead, each regressor held at its training-window mean
fcast4 <- forecast(sarimaMod3, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]), 48), rep(mean(new_merged$non_rhc[1:312,]),48), rep(mean(new_merged$time[1:312,]),48))) 
## Warning in forecast.forecast_ARIMA(sarimaMod3, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
# NOTE(review): warning concerns column names only; order matches training.
autoplot(fcast4)

#real data: the 48-month holdout
real_value=new_merged$asianhc[c(313:360)]

#compare with real data
# Preallocate and fill in order instead of append()-prepend plus rev();
# printed values and resulting vector are unchanged.
prediction2 <- numeric(48)
for (i in seq_len(48)) {
  print(fcast4$mean[i])
  prediction2[i] <- fcast4$mean[i]
}
## [1] 18.36
## [1] 18.11921
## [1] 18.0885
## [1] 18.07694
## [1] 18.06763
## [1] 18.05903
## [1] 18.05098
## [1] 18.04343
## [1] 18.03636
## [1] 18.02972
## [1] 18.0235
## [1] 18.01767
## [1] 18.0122
## [1] 18.00707
## [1] 18.00227
## [1] 17.99776
## [1] 17.99353
## [1] 17.98957
## [1] 17.98586
## [1] 17.98237
## [1] 17.97911
## [1] 17.97605
## [1] 17.97318
## [1] 17.97048
## [1] 17.96796
## [1] 17.96559
## [1] 17.96338
## [1] 17.9613
## [1] 17.95934
## [1] 17.95752
## [1] 17.9558
## [1] 17.95419
## [1] 17.95269
## [1] 17.95127
## [1] 17.94995
## [1] 17.94871
## [1] 17.94754
## [1] 17.94645
## [1] 17.94543
## [1] 17.94447
## [1] 17.94357
## [1] 17.94272
## [1] 17.94193
## [1] 17.94119
## [1] 17.94049
## [1] 17.93984
## [1] 17.93923
## [1] 17.93866
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.1798148
## [1] 0.1798148
#for asian hate crime, try CV using the arfima model
# Fractionally-differenced (ARFIMA) model with no ARMA terms, fitted on the
# same 312-observation training window as the SARIMA models above.
t=1:312
arfimaMod_ahc=arfima(asianHC$hate_crime[1:312], order = c(0,0,0)) 
## Note: only one starting point.  Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
summary(arfimaMod_ahc) #AIC = 1081.91; BIC = 1093.14 (see output below)
## 
## Call:
##  
## arfima(z = asianHC$hate_crime[1:312], order = c(0, 0, 0))
## 
## 
## Mode 1 Coefficients:
##               Estimate Std. Error Th. Std. Err.  z-value   Pr(>|z|)    
## d.f          0.3785830  0.0350286     0.0441362 10.80784 < 2.22e-16 ***
## Fitted mean 17.3546586  3.5608988            NA  4.87367 1.0954e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 31.5065; Log-likelihood = -537.957; AIC = 1081.91; BIC = 1093.14
## 
## Numerical Correlations of Coefficients:
##             d.f   Fitted mean
## d.f          1.00 -0.03      
## Fitted mean -0.03  1.00      
## 
## Theoretical Correlations of Coefficients:
##     d.f 
## d.f 1.00
## 
## Expected Fisher Information Matrix of Coefficients:
##     d.f 
## d.f 1.65
# 48-step-ahead ARFIMA forecast (note: this reuses the name fcast2 from the
# all-race section above).
fcast2 <- predict(arfimaMod_ahc, n.ahead=48) 
fcast2
## $`Mode 1`
## $`Mode 1`$`Forecasts and SDs`
##                    1        2        3        4        5        6        7
## Forecasts   12.18301 12.25992 12.36044 12.46772 12.57443 12.67774 12.77663
## Exact SD     5.61435  6.00413  6.18113  6.29011  6.36689  6.42528  6.47193
## Limiting SD  5.61306  6.00185  6.17800  6.28621  6.36228  6.42001  6.46602
##                    8        9       10       11       12       13       14
## Forecasts   12.87084 12.96046 13.04571 13.12687 13.20421 13.27802 13.34856
## Exact SD     6.51050  6.54320  6.57147  6.59629  6.61835  6.63817  6.65613
## Limiting SD  6.50399  6.53611  6.56382  6.58809  6.60963  6.62893  6.64639
##                   15       16       17       18       19       20       21
## Forecasts   13.41606 13.48074 13.54280 13.60242 13.65977 13.71499 13.76822
## Exact SD     6.67251  6.68757  6.70147  6.71438  6.72642  6.73768  6.74825
## Limiting SD  6.66229  6.67686  6.69030  6.70275  6.71433  6.72515  6.73529
##                   22       23       24       25       26       27       28
## Forecasts   13.81959 13.86921 13.91718 13.96359 14.00853 14.05209 14.09433
## Exact SD     6.75822  6.76763  6.77655  6.78501  6.79307  6.80075  6.80808
## Limiting SD  6.74483  6.75382  6.76232  6.77038  6.77804  6.78532  6.79227
##                  29       30       31       32       33       34       35
## Forecasts   14.1353 14.17515 14.21384 14.25147 14.28807 14.32370 14.35841
## Exact SD     6.8151  6.82183  6.82828  6.83449  6.84046  6.84621  6.85176
## Limiting SD  6.7989  6.80525  6.81133  6.81716  6.82277  6.82816  6.83335
##                   36       37       38       39       40       41       42
## Forecasts   14.39223 14.42520 14.45736 14.48874 14.51937 14.54929 14.57852
## Exact SD     6.85712  6.86229  6.86731  6.87216  6.87686  6.88143  6.88586
## Limiting SD  6.83836  6.84319  6.84786  6.85237  6.85674  6.86097  6.86507
##                   43       44       45       46       47      48
## Forecasts   14.60709 14.63502 14.66234 14.68907 14.71522 14.7408
## Exact SD     6.89016  6.89434  6.89841  6.90237  6.90623  6.9100
## Limiting SD  6.86905  6.87291  6.87666  6.88030  6.88385  6.8873
# NOTE(review): these values are hand-transcribed from the "Forecasts" row of
# the printed fcast2 output above — extracting them programmatically from the
# predict() object (layout is version-dependent in the arfima package) would
# avoid transcription errors; verify against the table before reuse.
predictedDF=data.frame("fit"= c(12.18301, 12.25992, 12.36044, 12.46772, 12.57443, 12.67774 ,12.77663, 12.87084, 12.96046, 13.04571, 13.12687, 13.20421, 13.27802, 13.34856, 13.41606, 13.48074, 13.54280, 13.60242, 13.65977, 13.71499, 13.76822, 13.81959, 13.86921, 13.91718, 13.96359, 14.00853, 14.05209, 14.09433, 14.1353, 14.17515, 14.21384, 14.25147,14.28807, 14.32370, 14.35841 ,14.39223, 14.42520 ,14.45736 ,14.48874, 14.51937 ,14.54929, 14.57852, 14.60709 ,14.63502 ,14.66234, 14.68907 ,14.71522, 14.7408))
#real data: the 48-month holdout (observations 313-360)
real_value=asianHC$hate_crime[c(313:360)]

#compare with real data
cv_df1=data.frame("predict"=predictedDF$fit, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.302834
## [1] 0.302834